はじまりの大地

このコミットが含まれているのは:
2024-07-15 09:14:04 +09:00
コミット 6632905f32
3501個のファイルの変更1439465行の追加0行の削除
+63
ファイルの表示
@@ -0,0 +1,63 @@
# Transcription DevTools
Includes:
* __JiWER__ CLI NodeJS wrapper
* Benchmark tool to test multiple transcription engines
* TypeScript classes to evaluate the word error rate of the transcript files produced by the transcription engines
## Build
```sh
npm run build
```
## Benchmark
A benchmark of the available __transcribers__ can be run with:
```sh
npm run benchmark
```
```
┌────────────────────────┬───────────────────────┬───────────────────────┬──────────┬────────┬───────────────────────┐
│ (index) │ WER │ CER │ duration │ model │ engine │
├────────────────────────┼───────────────────────┼───────────────────────┼──────────┼────────┼───────────────────────┤
│ 5yZGBYqojXe7nuhq1TuHvz │ '28.39506172839506%' │ '9.62457337883959%' │ '41s' │ 'tiny' │ 'openai-whisper' │
│ x6qREJ2AkTU4e5YmvfivQN │ '29.75206611570248%' │ '10.46195652173913%' │ '15s' │ 'tiny' │ 'whisper-ctranslate2' │
└────────────────────────┴───────────────────────┴───────────────────────┴──────────┴────────┴───────────────────────┘
```
The benchmark can be run with several built-in model sizes:
```sh
MODELS=tiny,small,large npm run benchmark
```
## Jiwer
> *JiWER is a python tool for computing the word-error-rate of ASR systems.*
> https://jitsi.github.io/jiwer/cli/
__JiWER__ serves as a reference implementation to calculate error rates between two text files:
- WER (Word Error Rate)
- CER (Character Error Rate)
### Usage
```typescript
const jiwerCLI = new JiwerClI('./reference.txt', './hypothesis.txt')
// WER as a ratio, e.g. 0.03 means 3%
console.log(await jiwerCLI.wer())
// CER as a ratio, e.g. 0.01 means 1%
console.log(await jiwerCLI.cer())
// Detailed comparison report
console.log(await jiwerCLI.alignment())
```
## Resources
- https://jitsi.github.io/jiwer/
- https://github.com/rapidfuzz/RapidFuzz
+20
ファイルの表示
@@ -0,0 +1,20 @@
{
"name": "@peertube/peertube-transcription-devtools",
"private": true,
"version": "0.0.0",
"main": "dist/index.js",
"files": [ "dist" ],
"exports": {
"types": "./dist/index.d.ts",
"peertube:tsx": "./src/index.ts",
"default": "./dist/index.js"
},
"type": "module",
"devDependencies": {},
"scripts": {
"build": "tsc",
"watch": "tsc -w",
"benchmark": "tsx --conditions=peertube:tsx --tsconfig ./tsconfig.json ./src/benchmark.ts"
},
"dependencies": {}
}
+1
ファイルの表示
@@ -0,0 +1 @@
jiwer
+142
ファイルの表示
@@ -0,0 +1,142 @@
import { millisecondsToTime } from '@peertube/peertube-core-utils'
import { SUUID, buildAbsoluteFixturePath, buildSUUID } from '@peertube/peertube-node-utils'
import {
TranscriptFile,
TranscriptionEngine,
TranscriptionEngineName,
TranscriptionModel,
transcriberFactory
} from '@peertube/peertube-transcription'
import { ensureDir, remove } from 'fs-extra/esm'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { PerformanceObserver, performance } from 'node:perf_hooks'
import { createLogger, format, transports } from 'winston'
import { TranscriptFileEvaluator } from './transcript-file-evaluator.js'
/**
 * One benchmark run, identified by the short UUID of the run.
 * Every field but `uuid` is optional because a result is built up from
 * several partial updates that are merged by uuid.
 */
interface BenchmarkResult {
// Identifier of the run, also used as the key of the results record
uuid: SUUID
// Word error rate as a ratio (multiplied by 100 when formatted for display)
WER?: number
// Character error rate as a ratio (multiplied by 100 when formatted for display)
CER?: number
// Duration of the performance measure named after the run uuid, formatted with millisecondsToTime for display
duration?: number
// Engine of the transcriber that produced the transcript
engine?: TranscriptionEngine
// Name of the model used for the run
model?: string
}
// Benchmark results indexed by the short UUID of their run
type Benchmark = Record<SUUID, BenchmarkResult>
/**
 * Merges a (possibly partial) result into the benchmark, without mutating the given benchmark.
 * Fields already stored for the same uuid are kept unless the new result overrides them.
 *
 * @param benchmark results collected so far, defaults to an empty benchmark
 * @param benchmarkResult result to merge, matched on its uuid
 * @returns a new benchmark object containing the merged result
 */
const benchmarkReducer = (benchmark: Benchmark = {}, benchmarkResult: BenchmarkResult) => {
  const { uuid } = benchmarkResult
  const mergedResult = { ...benchmark[uuid], ...benchmarkResult }

  return { ...benchmark, [uuid]: mergedResult }
}
/**
 * Builds a reducer callback that groups formatted benchmark results by model name.
 * The callback is meant to be folded over the uuids of `benchmarkResults`:
 * each step returns a new accumulator in which the formatted result of `uuid`
 * has been added to the group of its model.
 *
 * @param benchmarkResults raw results indexed by uuid
 */
const groupBenchmarkResultsByModel = (benchmarkResults: Record<string, BenchmarkResult>) => (benchmarksGroupedByModel, uuid) => {
  const result = benchmarkResults[uuid]
  const modelGroup = {
    ...benchmarksGroupedByModel[result.model],
    [uuid]: formatBenchmarkResult(result)
  }

  return { ...benchmarksGroupedByModel, [result.model]: modelGroup }
}
/**
 * Display-ready version of a benchmark result, as produced by formatBenchmarkResult:
 * every value is a string (or undefined when the corresponding value is unavailable).
 */
interface FormattedBenchmarkResult {
// Word error rate followed by a percent sign
WER?: string
// Character error rate followed by a percent sign
CER?: string
// Duration formatted by millisecondsToTime
duration?: string
// Name of the model
model?: string
// Name of the transcription engine
engine?: string
}
/**
 * Converts a (possibly partial) benchmark result into display-ready strings.
 *
 * - WER and CER ratios are multiplied by 100 and suffixed with `%`.
 * - The duration is formatted with `millisecondsToTime`.
 * - The engine is reduced to its name.
 *
 * Missing values are rendered as `undefined`.
 *
 * @param result benchmark result, any field may be missing
 * @returns the formatted result
 */
const formatBenchmarkResult = ({ WER, CER, duration, engine, model }: Partial<BenchmarkResult>): FormattedBenchmarkResult => ({
  // Compare against undefined instead of relying on truthiness: a rate of 0 is a valid (perfect) score
  WER: WER !== undefined ? `${WER * 100}%` : undefined,
  CER: CER !== undefined ? `${CER * 100}%` : undefined,
  // Truthiness check kept on purpose: a missing or zero duration means nothing was measured for the run
  duration: duration ? millisecondsToTime(duration) : undefined,
  model,
  // The engine may be absent from a partial result, do not throw in that case
  engine: engine?.name
})
/**
 * Benchmark entry point.
 *
 * For every transcriber and every requested model, transcribes the same fixture video,
 * compares the transcript with the reference transcript (WER and CER) and finally
 * prints one table per model.
 *
 * The models are read from the MODELS environment variable (comma separated list),
 * and default to "tiny".
 */
void (async () => {
  // Console logger printing the raw message only
  const logger = createLogger()
  logger.add(new transports.Console({ format: format.printf(log => log.message) }))

  const transcribers: TranscriptionEngineName[] = [ 'openai-whisper', 'whisper-ctranslate2' ]
  const models = process.env.MODELS
    ? process.env.MODELS.trim().split(',').map(modelName => modelName.trim()).filter(modelName => modelName)
    : [ 'tiny' ]

  const transcriptDirectory = join(tmpdir(), 'peertube-transcription', 'benchmark')
  const pipDirectory = join(tmpdir(), 'peertube-transcription', 'pip')

  const mediaFilePath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
  const referenceTranscriptFile = new TranscriptFile({
    path: buildAbsoluteFixturePath('transcription/videos/derive_sectaire.txt'),
    language: 'fr',
    format: 'txt'
  })

  let benchmarkResults: Record<string, BenchmarkResult> = {}

  // before
  await ensureDir(transcriptDirectory)

  // Store the duration of every performance measure, using the name of the measure as the run uuid.
  // NOTE(review): the measures are presumably emitted by the transcriber using the given runId - confirm.
  const performanceObserver = new PerformanceObserver((items) => {
    items
      .getEntries()
      .forEach((entry) => {
        benchmarkResults = benchmarkReducer(benchmarkResults, {
          uuid: entry.name as SUUID,
          duration: entry.duration
        })
      })
  })
  performanceObserver.observe({ type: 'measure' })

  try {
    // benchmark
    logger.info(`Running transcribers benchmark with the following models: ${models.join(', ')}`)

    for (const transcriberName of transcribers) {
      logger.info(`Create "${transcriberName}" transcriber for the benchmark...`)

      const transcriber = transcriberFactory.createFromEngineName({
        engineName: transcriberName,
        logger: createLogger({ transports: [ new transports.Console() ] }),
        binDirectory: join(pipDirectory, 'bin')
      })

      await transcriber.install(pipDirectory)

      for (const modelName of models) {
        logger.info(`Run benchmark with "${modelName}" model:`)

        const model = new TranscriptionModel(modelName)
        const uuid = buildSUUID()
        const transcriptFile = await transcriber.transcribe({
          mediaFilePath,
          model,
          transcriptDirectory,
          language: 'fr',
          format: 'txt',
          runId: uuid
        })
        const evaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcriptFile)

        // Yield to the event loop, presumably so the performance observer callback can run before the merge below
        await new Promise(resolve => setTimeout(resolve, 1))

        benchmarkResults = benchmarkReducer(benchmarkResults, {
          uuid,
          engine: transcriber.engine,
          WER: await evaluator.wer(),
          CER: await evaluator.cer(),
          model: model.name
        })
      }
    }

    // display
    const benchmarkResultsGroupedByModel = Object
      .keys(benchmarkResults)
      .reduce(groupBenchmarkResultsByModel(benchmarkResults), {})

    Object.values(benchmarkResultsGroupedByModel).forEach(benchmark => console.table(benchmark))
  } finally {
    // after: always clean up, even when a transcriber failed, so temporary transcripts do not pile up
    performanceObserver.disconnect()
    await remove(transcriptDirectory)
    performance.clearMarks()
  }
})()
+5
ファイルの表示
@@ -0,0 +1,5 @@
export * from './jiwer-cli.js'
export * from './levenshtein.js'
export * from './transcript-file-evaluator-interface.js'
export * from './transcript-file-evaluator.js'
export * from './utils.js'
+69
ファイルの表示
@@ -0,0 +1,69 @@
import { $ } from 'execa'
/**
 * Wrapper around the `jiwer` command line tool, used to compare a hypothesis text file
 * against a reference text file.
 *
 * Every measure is available as a static method taking both file paths, and as an
 * instance method using the paths given to the constructor.
 */
export class JiwerClI {
  referenceFilePath: string
  hypothesisFilePath: string

  /**
   * @param referenceFilePath Path to new-line delimited text file of reference sentences.
   * @param hypothesisFilePath Path to new-line delimited text file of hypothesis sentences.
   */
  constructor (referenceFilePath: string, hypothesisFilePath: string) {
    this.referenceFilePath = referenceFilePath
    this.hypothesisFilePath = hypothesisFilePath
  }

  /**
   * Builds the argument list given to the `jiwer` command.
   *
   * @param referenceFilePath Path to new-line delimited text file of reference sentences.
   * @param hypothesisFilePath Path to new-line delimited text file of hypothesis sentences.
   * @param args Additional arguments. Values that are not strings are skipped, which allows
   * conditional flags such as `enabled && '--flag'`.
   */
  static buildArgs (referenceFilePath: string, hypothesisFilePath: string, ...args: (string | false | null | undefined)[]): string[] {
    return [
      '--reference',
      referenceFilePath,
      '--hypothesis',
      hypothesisFilePath,
      // A disabled conditional flag evaluates to `false`, which must never reach the command line
      ...args.filter((arg): arg is string => typeof arg === 'string')
    ]
  }

  /**
   * Same as the static `buildArgs`, using the file paths of this instance.
   */
  buildArgs (...args: (string | false | null | undefined)[]): string[] {
    return JiwerClI.buildArgs(this.referenceFilePath, this.hypothesisFilePath, ...args)
  }

  /**
   * WER: Word Error Rate, returned as a ratio (ex: 0.03 means 3%)
   *
   * @param global when true the `-g` flag is given to jiwer
   */
  static async wer (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<number> {
    const { stdout: wer } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, global && '-g')}`

    return Number(wer)
  }

  async wer (global = true) {
    // The reference comes first: error rates are not symmetric between reference and hypothesis
    return await JiwerClI.wer(this.referenceFilePath, this.hypothesisFilePath, global)
  }

  /**
   * CER: Character Error Rate, returned as a ratio
   *
   * @param global when true the `-g` flag is given to jiwer
   */
  static async cer (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<number> {
    const { stdout: cer } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, '--cer', global && '-g')}`

    return Number(cer)
  }

  async cer (global = true) {
    return await JiwerClI.cer(this.referenceFilePath, this.hypothesisFilePath, global)
  }

  /**
   * Print alignment of each sentence.
   *
   * @param global when true the `-g` flag is given to jiwer
   * @returns the raw standard output of jiwer
   */
  static async alignment (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<string> {
    const { stdout: alignment } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, '--align', global && '-g')}`

    return alignment
  }

  async alignment (global = true) {
    return await JiwerClI.alignment(this.referenceFilePath, this.hypothesisFilePath, global)
  }
}
+101
ファイルの表示
@@ -0,0 +1,101 @@
/**
 * Computes the next cell of the edit distance matrix from its three neighbours.
 *
 * When `d0` or `d2` is smaller than `d1`, the result is the smaller of `d0` and `d2`, plus one.
 * Otherwise the result is `d1`, plus one unless both character codes are equal.
 *
 * @param d0 first neighbour distance
 * @param d1 second neighbour distance, the one reused as is when the characters match
 * @param d2 third neighbour distance
 * @param bx character code taken from the longest string
 * @param ay character code taken from the shortest string
 */
function min (d0: number, d1: number, d2: number, bx: number, ay: number) {
  const sideIsCheaper = d0 < d1 || d2 < d1

  if (sideIsCheaper) {
    const cheapestSide = d0 > d2 ? d2 : d0

    return cheapestSide + 1
  }

  if (bx === ay) {
    return d1
  }

  return d1 + 1
}
/**
 * Computes the Levenshtein (edit) distance between two strings.
 * Characters are compared with `charCodeAt`, i.e. as UTF-16 code units.
 *
 * @param a first string
 * @param b second string
 * @returns the distance, 0 when both strings are identical
 * @see https://github.com/gustf/js-levenshtein
 */
export function levenshteinDistance (a: string, b: string): number {
if (a === b) {
return 0
}
// From here on "a" is always the shortest string
if (a.length > b.length) {
const tmp = a
a = b
b = tmp
}
let la = a.length
let lb = b.length
// Ignore the common suffix
while (la > 0 && (a.charCodeAt(la - 1) === b.charCodeAt(lb - 1))) {
la--
lb--
}
// Ignore the common prefix, "offset" is the index of the first character left to compare
let offset = 0
while (offset < la && (a.charCodeAt(offset) === b.charCodeAt(offset))) {
offset++
}
la -= offset
lb -= offset
// Nothing left in the shortest string, or less than 3 characters left in the longest one:
// the remaining length of the longest string is returned as the distance
if (la === 0 || lb < 3) {
return lb
}
let x = 0
let y: number
let d0: number
let d1: number
let d2: number
let d3: number
let dd: number
let dy: number
let ay: number
let bx0: number
let bx1: number
let bx2: number
let bx3: number
// Interleaved pairs, one per remaining character of "a":
// even index: running distance (initialised to y + 1), odd index: character code
const vector: number[] = []
for (y = 0; y < la; y++) {
vector.push(y + 1)
vector.push(a.charCodeAt(offset + y))
}
const len = vector.length - 1
// Main loop: consumes the remaining characters of "b" four at a time
for (; x < lb - 3;) {
bx0 = b.charCodeAt(offset + (d0 = x))
bx1 = b.charCodeAt(offset + (d1 = x + 1))
bx2 = b.charCodeAt(offset + (d2 = x + 2))
bx3 = b.charCodeAt(offset + (d3 = x + 3))
dd = (x += 4)
// Walk the pairs of the vector (step of 2), updating the running distance of each character of "a"
for (y = 0; y < len; y += 2) {
dy = vector[y]
ay = vector[y + 1]
d0 = min(dy, d0, d1, bx0, ay)
d1 = min(d0, d1, d2, bx1, ay)
d2 = min(d1, d2, d3, bx2, ay)
dd = min(d2, d3, dd, bx3, ay)
vector[y] = dd
d3 = d2
d2 = d1
d1 = d0
d0 = dy
}
}
// Remainder loop: consumes the characters of "b" left over by the main loop, one at a time
for (; x < lb;) {
bx0 = b.charCodeAt(offset + (d0 = x))
dd = ++x
for (y = 0; y < len; y += 2) {
dy = vector[y]
vector[y] = dd = min(dy, d0, dd, bx0, vector[y + 1])
d0 = dy
}
}
// The last computed value is the distance
return dd
}
+12
ファイルの表示
@@ -0,0 +1,12 @@
/**
 * Result of a complete evaluation of a transcript file, as resolved by
 * TranscriptFileEvaluatorInterface.evaluate().
 */
export interface TranscriptFileEvaluation {
// Word error rate
wer: number
// Character error rate
cer: number
// Textual alignment report
alignment: string
}
/**
 * Contract of a transcript file evaluator: each measure is available on its own,
 * and evaluate() resolves all of them at once.
 */
export interface TranscriptFileEvaluatorInterface {
// Resolves the word error rate
wer(): Promise<number>
// Resolves the character error rate
cer(): Promise<number>
// Resolves the textual alignment report
alignment(): Promise<string>
// Resolves the word error rate, the character error rate and the alignment report together
evaluate(): Promise<TranscriptFileEvaluation>
}
+46
ファイルの表示
@@ -0,0 +1,46 @@
import assert from 'node:assert'
import { TranscriptFileEvaluatorInterface } from './transcript-file-evaluator-interface.js'
import { TranscriptFile } from '@peertube/peertube-transcription'
import { JiwerClI } from './jiwer-cli.js'
/**
 * Evaluates a hypothesis transcript file against a reference transcript file
 * by delegating every measure to the jiwer command line wrapper.
 * Only transcript files in the `txt` format are accepted.
 */
export class TranscriptFileEvaluator implements TranscriptFileEvaluatorInterface {
  referenceTranscriptFile: TranscriptFile
  hypothesisTranscriptFile: TranscriptFile
  jiwerCLI: JiwerClI

  /**
   * @param referenceTranscriptFile transcript considered as the truth, must be a txt file
   * @param hypothesisTranscriptFile transcript to evaluate, must be a txt file
   */
  constructor (referenceTranscriptFile: TranscriptFile, hypothesisTranscriptFile: TranscriptFile) {
    // The reference is checked first, then the hypothesis
    for (const transcriptFile of [ referenceTranscriptFile, hypothesisTranscriptFile ]) {
      assert(transcriptFile.format === 'txt', 'Can only evaluate txt transcript file')
    }

    this.referenceTranscriptFile = referenceTranscriptFile
    this.hypothesisTranscriptFile = hypothesisTranscriptFile
    this.jiwerCLI = new JiwerClI(referenceTranscriptFile.path, hypothesisTranscriptFile.path)
  }

  /**
   * WER: Word Error Rate
   */
  wer () {
    const { jiwerCLI } = this

    return jiwerCLI.wer()
  }

  /**
   * CER: Character Error Rate
   */
  cer () {
    const { jiwerCLI } = this

    return jiwerCLI.cer()
  }

  /**
   * Alignment report of the hypothesis against the reference
   */
  alignment () {
    const { jiwerCLI } = this

    return jiwerCLI.alignment()
  }

  /**
   * Runs every measure, one after the other, and gathers the results.
   */
  async evaluate () {
    const wer = await this.wer()
    const cer = await this.cer()
    const alignment = await this.alignment()

    return { wer, cer, alignment }
  }
}
+32
ファイルの表示
@@ -0,0 +1,32 @@
import { join, parse } from 'node:path'
import { createWriteStream } from 'node:fs'
import { lstat, unlink } from 'node:fs/promises'
import assert from 'node:assert'
import { $ } from 'execa'
import { makeFileRequest } from '@peertube/peertube-server-commands'
/**
 * Downloads a file into the given directory.
 * The name of the local file is the last segment of the URL.
 *
 * @param url URL of the file to download
 * @param targetDirectory directory in which the file is written
 * @returns a promise resolved with the path of the downloaded file once it is fully written,
 * or rejected with the message of the stream error
 */
export const downloadFile = async (url: string, targetDirectory: string) => {
  const { base } = parse(url)
  const filePath = join(targetDirectory, base)

  const fileStream = createWriteStream(filePath)
  const stream = makeFileRequest(url).pipe(fileStream)

  return await new Promise((resolve: (filePath: string) => void, reject) => {
    stream.on('finish', () => resolve(filePath))
    stream.on('error', async e => {
      fileStream.close()

      try {
        await unlink(filePath)
      } catch {
        // Best-effort cleanup: the file may never have been created (e.g. missing target directory).
        // A failure here must not prevent the promise from being rejected below.
      }

      // NOTE(review): rejects with the message string to stay compatible with existing callers,
      // consider rejecting with the Error itself
      reject(e.message)
    })
  })
}
/**
 * Extracts a zip archive next to itself using the `unzip` command, overwriting existing files.
 *
 * @param zipFilePath path of the archive, must point to a regular file
 * @returns the path of the archive without its extension
 * @throws an assertion error when the path is not a file
 */
export const unzip = async (zipFilePath: string) => {
  const stats = await lstat(zipFilePath)
  assert(stats.isFile(), `${zipFilePath} isn't a file.`)

  const parsedPath = parse(zipFilePath)
  await $`unzip -o ${zipFilePath} -d ${parsedPath.dir}`

  return join(parsedPath.dir, parsedPath.name)
}
+11
ファイルの表示
@@ -0,0 +1,11 @@
{
"extends": "../../tsconfig.base.json",
"compilerOptions": {
"outDir": "./dist",
"rootDir": "src",
"tsBuildInfoFile": "./dist/.tsbuildinfo",
},
"references": [
{ "path": "../transcription" }
]
}