Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

whisper : add support for large v3 #1444

Merged
merged 4 commits on Nov 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -417,9 +417,10 @@ samples:
.PHONY: medium.en
.PHONY: medium
.PHONY: large-v1
.PHONY: large-v2
.PHONY: large

tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large: main
bash ./models/download-ggml-model.sh $@
@echo ""
@echo "==============================================="
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ make small
make medium.en
make medium
make large-v1
make large-v2
make large
```

Expand All @@ -245,7 +246,7 @@ make large
| base | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
| small | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
| large | 2.9 GB | ~3.3 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |

## Quantization

Expand Down
2 changes: 1 addition & 1 deletion bindings/go/examples/go-model-download/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ const (

var (
// The models which will be downloaded, if no model is specified as an argument
modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large"}
modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large"}
)

var (
Expand Down
1 change: 0 additions & 1 deletion bindings/go/whisper.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ const (
SampleRate = C.WHISPER_SAMPLE_RATE // Expected sample rate, samples per second
SampleBits = uint16(unsafe.Sizeof(C.float(0))) * 8 // Sample size in bits
NumFFT = C.WHISPER_N_FFT
NumMEL = C.WHISPER_N_MEL
HopLength = C.WHISPER_HOP_LENGTH
ChunkSize = C.WHISPER_CHUNK_SIZE
)
Expand Down
4 changes: 3 additions & 1 deletion examples/bench.wasm/emscripten.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ void bench_main(size_t index) {

fprintf(stderr, "%s: running benchmark with %d threads - please wait...\n", __func__, n_threads);

if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
const int n_mels = whisper_model_n_mels(ctx);

if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
fprintf(stderr, "error: failed to set mel: %d\n", ret);
return;
}
Expand Down
4 changes: 3 additions & 1 deletion examples/bench/bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@ int whisper_bench_full(const whisper_params & params) {
return 2;
}

if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
const int n_mels = whisper_model_n_mels(ctx);

if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
fprintf(stderr, "error: failed to set mel: %d\n", ret);
return 3;
}
Expand Down
2 changes: 1 addition & 1 deletion examples/livestream.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ if [ -n "$3" ]; then
fi

# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )

# list available models
function list_models {
Expand Down
2 changes: 1 addition & 1 deletion examples/twitch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ help()
echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
echo "options:"
echo "-s Step in seconds (default is $step)."
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large' (default is '$model')."
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large' (default is '$model')."
echo "-t Number of threads to use."
echo "-h Print this help page."
echo
Expand Down
2 changes: 1 addition & 1 deletion extra/convert-all.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )

for model in "${models[@]}"; do
python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
Expand Down
3 changes: 2 additions & 1 deletion models/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ https://huggingface.co/ggerganov/whisper.cpp/tree/main
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
| large-v1 | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
| large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
| large-v2 | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
| large | 2.9 GB | ~4.7 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |

## Model files for testing purposes

Expand Down
4 changes: 2 additions & 2 deletions models/convert-h5-to-coreml.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,14 @@ def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
# Ported from models/convert-whisper-to-coreml.py
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
args = parser.parse_args()

if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
raise ValueError("Invalid model name")

pt_target_path = f"models/hf-{args.model_name}.pt"
Expand Down
2 changes: 1 addition & 1 deletion models/convert-pt-to-ggml.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def bytes_to_unicode():
# for backwards compatibility, also check for older hf_transformers format tokenizer files
# old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json
# new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken
multilingual = hparams["n_vocab"] == 51865
multilingual = hparams["n_vocab"] >= 51865
tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
tokenizer_type = "tiktoken"
if not tokenizer.is_file():
Expand Down
6 changes: 3 additions & 3 deletions models/convert-whisper-to-coreml.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
x = x.permute(0,2,3,1).squeeze(0)

# ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks
if self.token_embedding.weight.shape[0] == 51865:
if self.token_embedding.weight.shape[0] >= 51865:
# split in 11 chunks - 4715 each
splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//11, dim=0)
logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)
Expand Down Expand Up @@ -296,13 +296,13 @@ def convert_decoder(hparams, model, quantize=False):

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
args = parser.parse_args()

if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
raise ValueError("Invalid model name")

whisper = load_model(args.model).cpu()
Expand Down
4 changes: 2 additions & 2 deletions models/convert-whisper-to-openvino.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ def convert_encoder(hparams, encoder, mname):

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
args = parser.parse_args()

if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
raise ValueError("Invalid model name")

whisper = load_model(args.model).cpu()
Expand Down
2 changes: 1 addition & 1 deletion models/download-coreml-model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ function get_script_path() {
models_path="$(get_script_path)"

# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )

# list available models
function list_models {
Expand Down
6 changes: 3 additions & 3 deletions models/download-ggml-model.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ popd
set argc=0
for %%x in (%*) do set /A argc+=1

set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large
set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large

if %argc% neq 1 (
echo.
Expand Down Expand Up @@ -57,8 +57,8 @@ goto :eof
:list_models
echo.
echo Available models:
(for %%a in (%models%) do (
echo %%a
(for %%a in (%models%) do (
echo %%a
))
echo.
exit /b
1 change: 1 addition & 0 deletions models/download-ggml-model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ models=(
"medium-q5_0"
"medium.en-q5_0"
"large-v1"
"large-v2"
"large"
"large-q5_0"
)
Expand Down
2 changes: 1 addition & 1 deletion tests/run-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
cd `dirname $0`

# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )

# list available models
function list_models {
Expand Down
54 changes: 40 additions & 14 deletions whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,15 @@ enum e_model {
MODEL_LARGE,
};

static const std::map<e_model, std::string> g_model_name = {
{ MODEL_UNKNOWN, "unknown" },
{ MODEL_TINY, "tiny" },
{ MODEL_BASE, "base" },
{ MODEL_SMALL, "small" },
{ MODEL_MEDIUM, "medium" },
{ MODEL_LARGE, "large" },
};

static const std::map<std::string, std::pair<int, std::string>> g_lang = {
{ "en", { 0, "english", } },
{ "zh", { 1, "chinese", } },
Expand Down Expand Up @@ -293,6 +302,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
{ "ba", { 96, "bashkir", } },
{ "jw", { 97, "javanese", } },
{ "su", { 98, "sundanese", } },
{ "yue", { 99, "cantonese", } },
};

static const size_t MB = 1ull*1024*1024;
Expand Down Expand Up @@ -402,7 +412,11 @@ struct whisper_vocab {
id token_beg = 50363; // begin timestamps

bool is_multilingual() const {
return n_vocab == 51865;
return n_vocab >= 51865;
}

int num_languages() const {
return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
}
};

Expand Down Expand Up @@ -922,6 +936,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con

assert(hparams.n_text_state == hparams.n_audio_state);

std::string mver = "";

if (hparams.n_audio_layer == 4) {
model.type = e_model::MODEL_TINY;
}
Expand All @@ -940,6 +956,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con

if (hparams.n_audio_layer == 32) {
model.type = e_model::MODEL_LARGE;

if (hparams.n_vocab == 51866) {
mver = " v3";
}
}

const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
Expand Down Expand Up @@ -968,7 +988,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
log("%s: n_mels = %d\n", __func__, hparams.n_mels);
log("%s: ftype = %d\n", __func__, model.hparams.ftype);
log("%s: qntvr = %d\n", __func__, qntvr);
log("%s: type = %d\n", __func__, model.type);
log("%s: type = %d (%s%s)\n", __func__, model.type, g_model_name.at(model.type).c_str(), mver.c_str());

// print memory requirements
{
Expand Down Expand Up @@ -1039,13 +1059,17 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
if (vocab.is_multilingual()) {
vocab.token_eot++;
vocab.token_sot++;
vocab.token_translate++;
vocab.token_transcribe++;
vocab.token_solm++;
vocab.token_prev++;
vocab.token_nosp++;
vocab.token_not++;
vocab.token_beg++;

// account for variable number of language tokens
const int dt = vocab.num_languages() - 98;

vocab.token_translate += dt;
vocab.token_transcribe += dt;
vocab.token_solm += dt;
vocab.token_prev += dt;
vocab.token_nosp += dt;
vocab.token_not += dt;
vocab.token_beg += dt;
}

if (n_vocab < model.hparams.n_vocab) {
Expand Down Expand Up @@ -1074,6 +1098,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
vocab.id_to_token[i] = word;
}
}

log("%s: n_langs = %d\n", __func__, vocab.num_languages());
}

size_t ctx_size = 0;
Expand Down Expand Up @@ -3281,7 +3307,7 @@ void whisper_free_params(struct whisper_full_params * params) {
}

int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
log("%s: failed to compute mel spectrogram\n", __func__);
return -1;
}
Expand All @@ -3295,7 +3321,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int

// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
log("%s: failed to compute mel spectrogram\n", __func__);
return -1;
}
Expand All @@ -3318,13 +3344,13 @@ int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float *
// TODO

int whisper_set_mel_with_state(
struct whisper_context * /*ctx*/,
struct whisper_context * ctx,
struct whisper_state * state,
const float * data,
int n_len,
int n_mel) {
if (n_mel != WHISPER_N_MEL) {
log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
if (n_mel != ctx->model.filters.n_mel) {
log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, ctx->model.filters.n_mel);
return -1;
}

Expand Down
1 change: 0 additions & 1 deletion whisper.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@

#define WHISPER_SAMPLE_RATE 16000
#define WHISPER_N_FFT 400
#define WHISPER_N_MEL 80
#define WHISPER_HOP_LENGTH 160
#define WHISPER_CHUNK_SIZE 30

Expand Down
Loading