app.py
import logging
import os
import subprocess
import time
from collections import defaultdict
import matplotlib.pyplot as plt
import pandas as pd
import soundfile as sf
import whisper
from datasets import Audio, load_dataset
from jiwer import wer
from tqdm import tqdm

# Automatically download the English test split of Mozilla Common Voice.
# The dataset is cached to disk under cache_dir; each sample is later written
# out as a WAV file that we hand to whisper.cpp.
# Dataset card: https://huggingface.co/datasets/fsicoli/common_voice_17_0
# TODO: Make sure this is coming in at a 16 kHz sample rate
cv_17 = load_dataset(
    "mozilla-foundation/common_voice_17_0",
    "en",
    split="test",
    cache_dir="./dataset_cache",
    streaming=True,
    token=True,
)
cv_17 = cv_17.cast_column("audio", Audio(sampling_rate=16000))
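# Note on the TODO above: casting the audio column to Audio(sampling_rate=16000)
# makes `datasets` resample each clip to 16 kHz when it is decoded, which is
# the rate the Whisper models expect.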

model_list = [
    "tiny.en",
    "base.en",
    "small.en",
    "medium.en"
]
# Parameter counts for the models above, in millions
param_count = [39, 74, 244, 769]
"""
Will Richards, Oregon State University, 2024
Abstraction layer for automated speech recognition (ASR) of recorded audio
"""
class WhisperTranscriber:
def __init__(self, model: str) -> None:
self.model = whisper.load_model(model)
def transcribe(self, inputFile: str):
return self.model.transcribe(inputFile)["text"]


class AudioTranscriber:
    """Abstraction layer for ASR through a compiled whisper.cpp binary."""

    def __init__(self, model="base.en-q5_0"):
        self.modelPath = f"whisper.cpp/models/ggml-{model}.bin"

    def transcribe(self, inputFile: str) -> str:
        # -np suppresses whisper.cpp's progress prints, -nt omits timestamps
        full_command = f"whisper.cpp/main -m {self.modelPath} -f {inputFile} -np -nt"
        process = subprocess.Popen(
            full_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )

        # Get the output and error (if any)
        output, error = process.communicate()
        if error:
            logging.error(f"Error processing audio: {error.decode('utf-8')}")

        # Process and return the output string
        decoded_str = output.decode("utf-8").strip()
        processed_str = decoded_str.replace("[BLANK_AUDIO]", "").strip()
        return processed_str


# TODO: randomly sample the test subset... because there are too many damn samples
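# One possible way to address the TODO above (untested sketch): the streaming
# `datasets` API supports approximate shuffling through a reservoir buffer, so
#
#   cv_17 = cv_17.shuffle(seed=42, buffer_size=10_000)
#
# applied before the cv_17.take(1000) call in test_transcription() would draw
# a pseudo-random subset instead of always scoring the first 1000 samples.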


def compile_whisper_cpp() -> None:
    # First, make sure the main transcription binary has been compiled
    if not os.path.exists("whisper.cpp/main"):
        cmd = "cd whisper.cpp && make -j"
        process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, error = process.communicate()
        print(output.decode("utf-8"))

    # Next, make sure the quantization tool has been compiled
    if not os.path.exists("whisper.cpp/quantize"):
        cmd = "cd whisper.cpp && make -j quantize"
        process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, error = process.communicate()
        print(output.decode("utf-8"))

    # For every model, fetch it, then quantize it
    for model in model_list:
        _compile_whisper_model(model)
        _quantize_whisper_model(model)


def _compile_whisper_model(model: str) -> None:
    # The whisper.cpp Makefile target fetches the ggml model if it is missing
    if not os.path.exists(f"whisper.cpp/models/ggml-{model}.bin"):
        cmd = f"cd whisper.cpp && make -j {model}"
        process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, error = process.communicate()
        print(output.decode("utf-8"))


def _quantize_whisper_model(model: str) -> None:
    if not os.path.exists(f"whisper.cpp/models/ggml-{model}-q5_0.bin"):
        # quantize is not on PATH, so invoke it relative to the repo root
        cmd = f"cd whisper.cpp && ./quantize models/ggml-{model}.bin models/ggml-{model}-q5_0.bin q5_0"
        process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, error = process.communicate()
        print(output.decode("utf-8"))


def main():
    # Ensure all model binaries are compiled... will skip if already available
    compile_whisper_cpp()

    # Evaluate every model under each backend: the reference Python package,
    # plain whisper.cpp, and the q5_0-quantized whisper.cpp models
    model_dict = {x: model_list for x in ['python', 'cpp', 'q5_0']}
    results = defaultdict(dict)
    for arch_type, models in model_dict.items():
        for model in models:
            if not os.path.exists(f"{arch_type}_{model}.csv"):
                result_df = test_transcription(arch_type, model)
                result_df.to_csv(f"{arch_type}_{model}.csv")
            else:
                print("Experiment already ran, loading...")
                result_df = pd.read_csv(f"{arch_type}_{model}.csv")

            results[arch_type][model] = result_df
            avg_wer = result_df['WER'].mean()
            avg_runtime = result_df['runtime'].mean()
            print(f"{arch_type}: Average WER for {model}: {avg_wer}")
            print(f"{arch_type}: Average runtime for {model}: {avg_runtime}")


def test_transcription(arch_type: str, model: str) -> pd.DataFrame:
    match arch_type:
        case 'python':
            asr = WhisperTranscriber(model=model)
        case 'q5_0':
            asr = AudioTranscriber(model=f"{model}-q5_0")
        case 'cpp':
            asr = AudioTranscriber(model=model)
        case _:
            raise ValueError(f"Invalid architecture type: {arch_type}")

    records = []
    for sample in tqdm(cv_17.take(1000), total=1000):
        # Write sample to file
        basename = os.path.splitext(sample['path'])[0]
        sample_path = f"{basename}.wav"
        if not os.path.exists(sample_path):
            sf.write(sample_path, sample['audio']['array'], sample['audio']['sampling_rate'])

        # Dispatch file to ASR model for testing
        time_initial = time.time()
        pred = asr.transcribe(sample_path)
        time_final = time.time()

        # Compute WER between prediction and actual
        err = wer(sample['sentence'], pred)
        time_delta = time_final - time_initial
        records.append({"sample": sample['sentence'], "prediction": pred, "WER": err, "runtime": time_delta})

    df = pd.DataFrame.from_records(records)
    df.name = model
    return df


if __name__ == "__main__":
    main()