はじまりの大地
このコミットが含まれているのは:
@@ -0,0 +1,63 @@
|
||||
# Transcription DevTools
|
||||
|
||||
Includes:
|
||||
* __JiWER__ CLI NodeJS wrapper
|
||||
* Benchmark tool to test multiple transcription engines
|
||||
* TypeScript classes to evaluate the word-error-rate of files generated by the transcription
|
||||
|
||||
## Build
|
||||
|
||||
```sh
|
||||
npm run build
|
||||
```
|
||||
|
||||
## Benchmark
|
||||
|
||||
A benchmark of the available __transcribers__ can be run with:
|
||||
```sh
|
||||
npm run benchmark
|
||||
```
|
||||
```
|
||||
┌────────────────────────┬───────────────────────┬───────────────────────┬──────────┬────────┬───────────────────────┐
|
||||
│ (index) │ WER │ CER │ duration │ model │ engine │
|
||||
├────────────────────────┼───────────────────────┼───────────────────────┼──────────┼────────┼───────────────────────┤
|
||||
│ 5yZGBYqojXe7nuhq1TuHvz │ '28.39506172839506%' │ '9.62457337883959%' │ '41s' │ 'tiny' │ 'openai-whisper' │
|
||||
│ x6qREJ2AkTU4e5YmvfivQN │ '29.75206611570248%' │ '10.46195652173913%' │ '15s' │ 'tiny' │ 'whisper-ctranslate2' │
|
||||
└────────────────────────┴───────────────────────┴───────────────────────┴──────────┴────────┴───────────────────────┘
|
||||
```
|
||||
|
||||
The benchmark may be run with multiple built-in model sizes:
|
||||
|
||||
```sh
|
||||
MODELS=tiny,small,large npm run benchmark
|
||||
```
|
||||
|
||||
## Jiwer
|
||||
|
||||
> *JiWER is a python tool for computing the word-error-rate of ASR systems.*
|
||||
> https://jitsi.github.io/jiwer/cli/
|
||||
|
||||
__JiWER__ serves as a reference implementation to calculate error rates between two text files:
|
||||
- WER (Word Error Rate)
|
||||
- CER (Character Error Rate)
|
||||
|
||||
|
||||
### Usage
|
||||
|
||||
```typescript
|
||||
const jiwerCLI = new JiwerClI('./reference.txt', './hypothesis.txt')
|
||||
|
||||
// WER as a percentage, ex: 0.03 -> 3%
|
||||
console.log(await jiwerCLI.wer())
|
||||
|
||||
// CER as a percentage: 0.01 -> 1%
|
||||
console.log(await jiwerCLI.cer())
|
||||
|
||||
// Detailed comparison report
|
||||
console.log(await jiwerCLI.alignment())
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
- https://jitsi.github.io/jiwer/
|
||||
- https://github.com/rapidfuzz/RapidFuzz
|
||||
@@ -0,0 +1,20 @@
|
||||
{
|
||||
"name": "@peertube/peertube-transcription-devtools",
|
||||
"private": true,
|
||||
"version": "0.0.0",
|
||||
"main": "dist/index.js",
|
||||
"files": [ "dist" ],
|
||||
"exports": {
|
||||
"types": "./dist/index.d.ts",
|
||||
"peertube:tsx": "./src/index.ts",
|
||||
"default": "./dist/index.js"
|
||||
},
|
||||
"type": "module",
|
||||
"devDependencies": {},
|
||||
"scripts": {
|
||||
"build": "tsc",
|
||||
"watch": "tsc -w",
|
||||
"benchmark": "tsx --conditions=peertube:tsx --tsconfig ./tsconfig.json ./src/benchmark.ts"
|
||||
},
|
||||
"dependencies": {}
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
jiwer
|
||||
@@ -0,0 +1,142 @@
|
||||
import { millisecondsToTime } from '@peertube/peertube-core-utils'
|
||||
import { SUUID, buildAbsoluteFixturePath, buildSUUID } from '@peertube/peertube-node-utils'
|
||||
import {
|
||||
TranscriptFile,
|
||||
TranscriptionEngine,
|
||||
TranscriptionEngineName,
|
||||
TranscriptionModel,
|
||||
transcriberFactory
|
||||
} from '@peertube/peertube-transcription'
|
||||
import { ensureDir, remove } from 'fs-extra/esm'
|
||||
import { tmpdir } from 'node:os'
|
||||
import { join } from 'node:path'
|
||||
import { PerformanceObserver, performance } from 'node:perf_hooks'
|
||||
import { createLogger, format, transports } from 'winston'
|
||||
import { TranscriptFileEvaluator } from './transcript-file-evaluator.js'
|
||||
|
||||
/**
 * Metrics gathered for one transcription run.
 *
 * Only `uuid` is mandatory: the other fields are optional so that a result
 * can be built up from several partial updates sharing the same `uuid`.
 */
interface BenchmarkResult {
  /** Identifier of the run, used as the key in a `Benchmark` */
  uuid: SUUID

  /** Word Error Rate of the run */
  WER?: number

  /** Character Error Rate of the run */
  CER?: number

  /** Duration of the run (formatted with `millisecondsToTime` for display) */
  duration?: number

  /** Engine that produced the transcript */
  engine?: TranscriptionEngine

  /** Name of the model used for the run */
  model?: string
}

/** Benchmark results indexed by run identifier */
type Benchmark = Record<SUUID, BenchmarkResult>
|
||||
|
||||
/**
 * Merges a (possibly partial) result into a benchmark without mutating it.
 *
 * Fields already stored for the same `uuid` are kept unless the incoming
 * result overrides them.
 *
 * @param benchmark results accumulated so far (defaults to an empty benchmark)
 * @param benchmarkResult result to merge, identified by its `uuid`
 * @returns a new benchmark object containing the merged entry
 */
const benchmarkReducer = (benchmark: Benchmark = {}, benchmarkResult: BenchmarkResult) => {
  const { uuid } = benchmarkResult
  const mergedResult = { ...benchmark[uuid], ...benchmarkResult }

  return { ...benchmark, [uuid]: mergedResult }
}
|
||||
|
||||
/**
 * Builds a reducer callback (to be used over the keys of `benchmarkResults`)
 * that groups formatted results by model name, then by run identifier.
 *
 * @param benchmarkResults raw results indexed by run identifier
 * @returns a `(accumulator, uuid) => accumulator` reducer callback
 */
const groupBenchmarkResultsByModel = (benchmarkResults: Record<string, BenchmarkResult>) => (benchmarksGroupedByModel, uuid) => {
  const result = benchmarkResults[uuid]
  const modelName = result.model

  return {
    ...benchmarksGroupedByModel,
    [modelName]: {
      // Keep the runs already grouped under this model
      ...benchmarksGroupedByModel[modelName],
      [uuid]: formatBenchmarkResult(result)
    }
  }
}
|
||||
|
||||
/**
 * Display-ready counterpart of a benchmark result: every metric is a string
 * so the result can be printed as a table row.
 */
interface FormattedBenchmarkResult {
  /** Word Error Rate, formatted as a percentage string */
  WER?: string

  /** Character Error Rate, formatted as a percentage string */
  CER?: string

  /** Formatted duration of the run */
  duration?: string

  /** Name of the model used for the run */
  model?: string

  /** Name of the transcription engine */
  engine?: string
}
|
||||
|
||||
/**
 * Converts a raw benchmark result into its display-ready form.
 *
 * - `WER` and `CER` are ratios and are rendered as percentages; a ratio of `0`
 *   (perfect transcript) is rendered as `0%` rather than being dropped.
 * - `engine` may be missing because the input is partial; in that case the
 *   formatted engine name is `undefined` instead of throwing.
 *
 * @param result partial benchmark result
 * @returns the formatted result, with `undefined` for every missing metric
 */
const formatBenchmarkResult = ({ WER, CER, duration, engine, model }: Partial<BenchmarkResult>): FormattedBenchmarkResult => ({
  // Explicit undefined checks: 0 is a valid error rate
  WER: WER !== undefined ? `${WER * 100}%` : undefined,
  CER: CER !== undefined ? `${CER * 100}%` : undefined,
  duration: duration ? millisecondsToTime(duration) : undefined,
  model,
  engine: engine?.name
})
|
||||
|
||||
/**
 * Benchmark entry point.
 *
 * For every transcription engine and every requested model, transcribes the
 * fixture video, compares the transcript with the reference transcript (WER
 * and CER) and finally prints one table per model.
 *
 * Models are read from the `MODELS` environment variable (comma separated),
 * defaulting to `tiny`.
 */
void (async () => {
  const logger = createLogger()
  logger.add(new transports.Console({ format: format.printf(log => log.message) }))

  const transcribers: TranscriptionEngineName[] = [ 'openai-whisper', 'whisper-ctranslate2' ]
  const models = process.env.MODELS
    ? process.env.MODELS.trim().split(',').map(modelName => modelName.trim()).filter(modelName => modelName)
    : [ 'tiny' ]

  const transcriptDirectory = join(tmpdir(), 'peertube-transcription', 'benchmark')
  const pipDirectory = join(tmpdir(), 'peertube-transcription', 'pip')

  const mediaFilePath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
  const referenceTranscriptFile = new TranscriptFile({
    path: buildAbsoluteFixturePath('transcription/videos/derive_sectaire.txt'),
    language: 'fr',
    format: 'txt'
  })

  let benchmarkResults: Record<string, BenchmarkResult> = {}

  // before
  await ensureDir(transcriptDirectory)

  // Every `measure` performance entry is stored as the duration of the run
  // whose uuid equals the entry name.
  // NOTE(review): presumably the transcriber emits a measure named after `runId` - confirm.
  const performanceObserver = new PerformanceObserver((items) => {
    items
      .getEntries()
      .forEach((entry) => {
        benchmarkResults = benchmarkReducer(benchmarkResults, {
          uuid: entry.name as SUUID,
          duration: entry.duration
        })
      })
  })
  performanceObserver.observe({ type: 'measure' })

  try {
    // benchmark
    logger.info(`Running transcribers benchmark with the following models: ${models.join(', ')}`)

    for (const transcriberName of transcribers) {
      logger.info(`Create "${transcriberName}" transcriber for the benchmark...`)

      const transcriber = transcriberFactory.createFromEngineName({
        engineName: transcriberName,
        logger: createLogger({ transports: [ new transports.Console() ] }),
        binDirectory: join(pipDirectory, 'bin')
      })

      await transcriber.install(pipDirectory)

      for (const modelName of models) {
        logger.info(`Run benchmark with "${modelName}" model:`)

        const model = new TranscriptionModel(modelName)
        const uuid = buildSUUID()
        const transcriptFile = await transcriber.transcribe({
          mediaFilePath,
          model,
          transcriptDirectory,
          language: 'fr',
          format: 'txt',
          runId: uuid
        })
        const evaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcriptFile)

        // Yield to the event loop, presumably so the performance observer
        // callback has a chance to run before the results are merged.
        await new Promise(resolve => setTimeout(resolve, 1))

        // Resolve the metrics BEFORE reading `benchmarkResults`: the observer
        // may replace it while we are awaiting, and merging into a stale
        // snapshot would silently drop the recorded duration.
        const WER = await evaluator.wer()
        const CER = await evaluator.cer()

        benchmarkResults = benchmarkReducer(benchmarkResults, {
          uuid,
          engine: transcriber.engine,
          WER,
          CER,
          model: model.name
        })
      }
    }

    // display
    const benchmarkResultsGroupedByModel = Object
      .keys(benchmarkResults)
      .reduce(groupBenchmarkResultsByModel(benchmarkResults), {})
    Object.values(benchmarkResultsGroupedByModel).forEach(benchmark => console.table(benchmark))
  } finally {
    // after: always clean up, even when an engine fails to install or transcribe
    performanceObserver.disconnect()
    await remove(transcriptDirectory)
    performance.clearMarks()
  }
})()
|
||||
@@ -0,0 +1,5 @@
|
||||
export * from './jiwer-cli.js'
|
||||
export * from './levenshtein.js'
|
||||
export * from './transcript-file-evaluator-interface.js'
|
||||
export * from './transcript-file-evaluator.js'
|
||||
export * from './utils.js'
|
||||
@@ -0,0 +1,69 @@
|
||||
import { $ } from 'execa'
|
||||
|
||||
/**
 * Wrapper around the `jiwer` command line tool, which must be available in the PATH.
 *
 * Every operation exists both as a static method (explicit file paths) and as
 * an instance method (file paths given to the constructor).
 */
export class JiwerClI {
  referenceFilePath: string
  hypothesisFilePath: string

  constructor (referenceFilePath: string, hypothesisFilePath: string) {
    this.referenceFilePath = referenceFilePath
    this.hypothesisFilePath = hypothesisFilePath
  }

  /**
   * Builds the argument list passed to the `jiwer` command.
   *
   * @param referenceFilePath Path to new-line delimited text file of reference sentences.
   * @param hypothesisFilePath Path to new-line delimited text file of hypothesis sentences.
   * @param args Extra arguments appended as is after the file options.
   */
  static buildArgs (referenceFilePath: string, hypothesisFilePath: string, ...args: string[]) {
    return [
      '--reference',
      referenceFilePath,
      '--hypothesis',
      hypothesisFilePath,
      ...args
    ]
  }

  /**
   * Same as the static `buildArgs`, using the file paths of this instance.
   */
  buildArgs (...args: string[]) {
    return JiwerClI.buildArgs(this.referenceFilePath, this.hypothesisFilePath, ...args)
  }

  /**
   * Returns the `-g` flag as an argument list, or no argument at all.
   * Keeps non-string values such as `false` out of the command line.
   */
  private static globalArgs (global: boolean): string[] {
    return global ? [ '-g' ] : []
  }

  /**
   * WER: Word Error Rate, returned as a ratio (ex: 0.03 means 3%)
   *
   * @param global when true, the `-g` flag is passed to jiwer
   */
  static async wer (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<number> {
    const args = JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, ...JiwerClI.globalArgs(global))
    const { stdout: wer } = await $`jiwer ${args}`

    return Number(wer)
  }

  async wer (global = true) {
    // Reference first, hypothesis second: error rates are not symmetric
    return await JiwerClI.wer(this.referenceFilePath, this.hypothesisFilePath, global)
  }

  /**
   * CER: Character Error Rate, returned as a ratio (ex: 0.01 means 1%)
   *
   * @param global when true, the `-g` flag is passed to jiwer
   */
  static async cer (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<number> {
    const args = JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, '--cer', ...JiwerClI.globalArgs(global))
    const { stdout: cer } = await $`jiwer ${args}`

    return Number(cer)
  }

  async cer (global = true) {
    return await JiwerClI.cer(this.referenceFilePath, this.hypothesisFilePath, global)
  }

  /**
   * Print alignment of each sentence.
   *
   * @param global when true, the `-g` flag is passed to jiwer
   */
  static async alignment (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<string> {
    const args = JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, '--align', ...JiwerClI.globalArgs(global))
    const { stdout: alignment } = await $`jiwer ${args}`

    return alignment
  }

  async alignment (global = true) {
    return await JiwerClI.alignment(this.referenceFilePath, this.hypothesisFilePath, global)
  }
}
|
||||
@@ -0,0 +1,101 @@
|
||||
/**
 * Picks the next cell value of the edit-distance matrix.
 *
 * When `d1` is not greater than both `d0` and `d2`, the result is `d1`, plus
 * one if the two character codes differ. Otherwise the result is the smaller
 * of `d0` and `d2`, plus one.
 */
function min (d0: number, d1: number, d2: number, bx: number, ay: number) {
  const d1IsLowest = !(d0 < d1 || d2 < d1)

  if (d1IsLowest) {
    return bx === ay ? d1 : d1 + 1
  }

  return d0 > d2 ? d2 + 1 : d0 + 1
}

/**
 * Levenshtein distance between two strings, compared by UTF-16 code unit.
 *
 * @see https://github.com/gustf/js-levenshtein
 */
export function levenshteinDistance (a: string, b: string): number {
  if (a === b) return 0

  // Work with the shortest string on the "row" side
  const [ shorter, longer ] = a.length > b.length ? [ b, a ] : [ a, b ]

  let shorterLength = shorter.length
  let longerLength = longer.length

  // Ignore the common suffix
  while (shorterLength > 0 && shorter.charCodeAt(shorterLength - 1) === longer.charCodeAt(longerLength - 1)) {
    shorterLength -= 1
    longerLength -= 1
  }

  // Ignore the common prefix
  let start = 0
  while (start < shorterLength && shorter.charCodeAt(start) === longer.charCodeAt(start)) {
    start += 1
  }

  shorterLength -= start
  longerLength -= start

  if (shorterLength === 0 || longerLength < 3) return longerLength

  // Interleaved storage: [ cost, charCode, cost, charCode, ... ]
  const row: number[] = []
  for (let i = 0; i < shorterLength; i++) {
    row.push(i + 1, shorter.charCodeAt(start + i))
  }

  const lastIndex = row.length - 1

  let column = 0
  // Always overwritten: at least one of the loops below runs since longerLength >= 3
  let distance = 0

  // Process the longest string four characters at a time
  while (column < longerLength - 3) {
    const code0 = longer.charCodeAt(start + column)
    const code1 = longer.charCodeAt(start + column + 1)
    const code2 = longer.charCodeAt(start + column + 2)
    const code3 = longer.charCodeAt(start + column + 3)

    let d0 = column
    let d1 = column + 1
    let d2 = column + 2
    let d3 = column + 3

    column += 4
    distance = column

    for (let i = 0; i < lastIndex; i += 2) {
      const previous = row[i]
      const rowCode = row[i + 1]

      d0 = min(previous, d0, d1, code0, rowCode)
      d1 = min(d0, d1, d2, code1, rowCode)
      d2 = min(d1, d2, d3, code2, rowCode)
      distance = min(d2, d3, distance, code3, rowCode)

      row[i] = distance

      d3 = d2
      d2 = d1
      d1 = d0
      d0 = previous
    }
  }

  // Process the remaining characters one at a time
  while (column < longerLength) {
    const code = longer.charCodeAt(start + column)
    let diagonal = column

    column += 1
    distance = column

    for (let i = 0; i < lastIndex; i += 2) {
      const previous = row[i]

      distance = min(previous, diagonal, distance, code, row[i + 1])
      row[i] = distance

      diagonal = previous
    }
  }

  return distance
}
|
||||
@@ -0,0 +1,12 @@
|
||||
/**
 * Complete evaluation of a transcript file: both error rates and the alignment report.
 */
export interface TranscriptFileEvaluation {
  /** Word Error Rate */
  wer: number

  /** Character Error Rate */
  cer: number

  /** Alignment report */
  alignment: string
}
|
||||
|
||||
/**
 * Contract of a transcript file evaluator: each metric can be requested
 * on its own, or all at once through `evaluate`.
 */
export interface TranscriptFileEvaluatorInterface {
  /** Resolves with the Word Error Rate */
  wer(): Promise<number>

  /** Resolves with the Character Error Rate */
  cer(): Promise<number>

  /** Resolves with the alignment report */
  alignment(): Promise<string>

  /** Resolves with every metric gathered in a single object */
  evaluate(): Promise<TranscriptFileEvaluation>
}
|
||||
@@ -0,0 +1,46 @@
|
||||
import assert from 'node:assert'
|
||||
import { TranscriptFileEvaluatorInterface } from './transcript-file-evaluator-interface.js'
|
||||
import { TranscriptFile } from '@peertube/peertube-transcription'
|
||||
import { JiwerClI } from './jiwer-cli.js'
|
||||
|
||||
/**
 * Evaluates a hypothesis transcript file against a reference transcript file
 * by delegating every metric to the jiwer command line wrapper.
 *
 * Both files must use the `txt` format.
 */
export class TranscriptFileEvaluator implements TranscriptFileEvaluatorInterface {
  referenceTranscriptFile: TranscriptFile
  hypothesisTranscriptFile: TranscriptFile
  jiwerCLI: JiwerClI

  /**
   * @param referenceTranscriptFile transcript considered as the ground truth
   * @param hypothesisTranscriptFile transcript to evaluate
   * @throws AssertionError when one of the files is not a `txt` transcript
   */
  constructor (referenceTranscriptFile: TranscriptFile, hypothesisTranscriptFile: TranscriptFile) {
    for (const transcriptFile of [ referenceTranscriptFile, hypothesisTranscriptFile ]) {
      assert(transcriptFile.format === 'txt', 'Can only evaluate txt transcript file')
    }

    this.referenceTranscriptFile = referenceTranscriptFile
    this.hypothesisTranscriptFile = hypothesisTranscriptFile

    this.jiwerCLI = new JiwerClI(referenceTranscriptFile.path, hypothesisTranscriptFile.path)
  }

  /**
   * WER: Word Error Rate
   */
  wer () {
    return this.jiwerCLI.wer()
  }

  /**
   * CER: Character Error Rate
   */
  cer () {
    return this.jiwerCLI.cer()
  }

  /**
   * Alignment report between the two transcripts
   */
  alignment () {
    return this.jiwerCLI.alignment()
  }

  /**
   * Computes every metric, one after the other.
   */
  async evaluate () {
    const wer = await this.wer()
    const cer = await this.cer()
    const alignment = await this.alignment()

    return { wer, cer, alignment }
  }
}
|
||||
@@ -0,0 +1,32 @@
|
||||
import { join, parse } from 'node:path'
|
||||
import { createWriteStream } from 'node:fs'
|
||||
import { lstat, unlink } from 'node:fs/promises'
|
||||
import assert from 'node:assert'
|
||||
import { $ } from 'execa'
|
||||
import { makeFileRequest } from '@peertube/peertube-server-commands'
|
||||
|
||||
/**
 * Downloads `url` into `targetDirectory`, naming the file after the last
 * segment of the URL.
 *
 * @param url URL of the file to download
 * @param targetDirectory existing directory that receives the file
 * @returns a promise resolved with the path of the downloaded file, or
 * rejected with the error message when writing fails
 */
export const downloadFile = async (url: string, targetDirectory: string) => {
  const { base } = parse(url)
  const filePath = join(targetDirectory, base)

  const fileStream = createWriteStream(filePath)
  const stream = makeFileRequest(url).pipe(fileStream)

  return await new Promise((resolve: (filePath: string) => void, reject) => {
    stream.on('finish', () => resolve(filePath))
    stream.on('error', async e => {
      fileStream.close()

      try {
        await unlink(filePath)
      } catch {
        // Best effort: failing to remove the partial file must not prevent
        // the promise from being rejected with the original error
      }

      // NOTE(review): rejects with a string (kept for backward compatibility), not an Error
      reject(e.message)
    })
  })
}
|
||||
|
||||
/**
 * Extracts a zip archive next to itself using the `unzip` command,
 * overwriting existing files.
 *
 * @param zipFilePath path of the archive
 * @returns the archive path without its extension
 * @throws AssertionError when `zipFilePath` is not a file
 */
export const unzip = async (zipFilePath: string) => {
  const stats = await lstat(zipFilePath)
  assert(stats.isFile(), `${zipFilePath} isn't a file.`)

  const archive = parse(zipFilePath)

  await $`unzip -o ${zipFilePath} -d ${archive.dir}`

  return join(archive.dir, archive.name)
}
|
||||
@@ -0,0 +1,11 @@
|
||||
{
|
||||
"extends": "../../tsconfig.base.json",
|
||||
"compilerOptions": {
|
||||
"outDir": "./dist",
|
||||
"rootDir": "src",
|
||||
"tsBuildInfoFile": "./dist/.tsbuildinfo",
|
||||
},
|
||||
"references": [
|
||||
{ "path": "../transcription" }
|
||||
]
|
||||
}
|
||||
新しい課題から参照
ユーザをブロックする