From 6d6c123b78867167fe85e4cb09d248a952768a38 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 7 Nov 2023 15:30:18 +0200
Subject: [PATCH] whisper : add support for large v3 (#1444)

* whisper : add support for large v3

* bench : fix build + fix go bindings

* bench : fix n_mels

* models : update readme
---
 Makefile                                  |  3 +-
 README.md                                 |  3 +-
 .../go/examples/go-model-download/main.go |  2 +-
 bindings/go/whisper.go                    |  1 -
 examples/bench.wasm/emscripten.cpp        |  4 +-
 examples/bench/bench.cpp                  |  4 +-
 examples/livestream.sh                    |  2 +-
 examples/twitch.sh                        |  2 +-
 extra/convert-all.sh                      |  2 +-
 models/README.md                          |  3 +-
 models/convert-h5-to-coreml.py            |  4 +-
 models/convert-pt-to-ggml.py              |  2 +-
 models/convert-whisper-to-coreml.py       |  6 +--
 models/convert-whisper-to-openvino.py     |  4 +-
 models/download-coreml-model.sh           |  2 +-
 models/download-ggml-model.cmd            |  6 +--
 models/download-ggml-model.sh             |  1 +
 tests/run-tests.sh                        |  2 +-
 whisper.cpp                               | 54 ++++++++++++++-----
 whisper.h                                 |  1 -
 20 files changed, 70 insertions(+), 38 deletions(-)

diff --git a/Makefile b/Makefile
index e9a97af490d..d134b768bc6 100644
--- a/Makefile
+++ b/Makefile
@@ -417,9 +417,10 @@ samples:
 .PHONY: medium.en
 .PHONY: medium
 .PHONY: large-v1
+.PHONY: large-v2
 .PHONY: large

-tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
+tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large: main
 	bash ./models/download-ggml-model.sh $@
 	@echo ""
 	@echo "==============================================="
diff --git a/README.md b/README.md
index 4267f418d16..988785cf92c 100644
--- a/README.md
+++ b/README.md
@@ -234,6 +234,7 @@ make small
 make medium.en
 make medium
 make large-v1
+make large-v2
 make large
 ```
@@ -245,7 +246,7 @@ make large
 | base   | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
 | small  | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
 | medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
-| large  | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
+| large  | 2.9 GB | ~3.3 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |

 ## Quantization
diff --git a/bindings/go/examples/go-model-download/main.go b/bindings/go/examples/go-model-download/main.go
index 67462a581d3..d3e45c28ea3 100644
--- a/bindings/go/examples/go-model-download/main.go
+++ b/bindings/go/examples/go-model-download/main.go
@@ -24,7 +24,7 @@ const (

 var (
 	// The models which will be downloaded, if no model is specified as an argument
-	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large"}
+	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large"}
 )

 var (
diff --git a/bindings/go/whisper.go b/bindings/go/whisper.go
index b77e103c4e3..9660662084f 100644
--- a/bindings/go/whisper.go
+++ b/bindings/go/whisper.go
@@ -83,7 +83,6 @@ const (
 	SampleRate = C.WHISPER_SAMPLE_RATE                 // Expected sample rate, samples per second
 	SampleBits = uint16(unsafe.Sizeof(C.float(0))) * 8 // Sample size in bits
 	NumFFT     = C.WHISPER_N_FFT
-	NumMEL     = C.WHISPER_N_MEL
 	HopLength  = C.WHISPER_HOP_LENGTH
 	ChunkSize  = C.WHISPER_CHUNK_SIZE
 )
diff --git a/examples/bench.wasm/emscripten.cpp b/examples/bench.wasm/emscripten.cpp
index 3624bbc48b1..083397db057 100644
--- a/examples/bench.wasm/emscripten.cpp
+++ b/examples/bench.wasm/emscripten.cpp
@@ -23,7 +23,9 @@ void bench_main(size_t index) {
     fprintf(stderr, "%s: running benchmark with %d threads - please wait...\n", __func__, n_threads);

-    if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
+    const int n_mels = whisper_model_n_mels(ctx);
+
+    if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
         fprintf(stderr, "error: failed to set mel: %d\n", ret);
         return;
     }
diff --git a/examples/bench/bench.cpp b/examples/bench/bench.cpp
index 9f50b3b6224..db1c4e800cd 100644
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@@ -73,7 +73,9 @@ int whisper_bench_full(const whisper_params & params) {
         return 2;
     }

-    if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
+    const int n_mels = whisper_model_n_mels(ctx);
+
+    if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
         fprintf(stderr, "error: failed to set mel: %d\n", ret);
         return 3;
     }
diff --git a/examples/livestream.sh b/examples/livestream.sh
index 42d0102fd58..d86a7c601d5 100755
--- a/examples/livestream.sh
+++ b/examples/livestream.sh
@@ -48,7 +48,7 @@ if [ -n "$3" ]; then
 fi

 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )

 # list available models
 function list_models {
diff --git a/examples/twitch.sh b/examples/twitch.sh
index c185fb24f16..77b618dde9b 100755
--- a/examples/twitch.sh
+++ b/examples/twitch.sh
@@ -21,7 +21,7 @@ help()
     echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
     echo "options:"
     echo "-s Step in seconds (default is $step)."
-    echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large' (default is '$model')."
+    echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large' (default is '$model')."
     echo "-t Number of threads to use."
     echo "-h Print this help page."
     echo
diff --git a/extra/convert-all.sh b/extra/convert-all.sh
index c5ba9094d7b..c9638079c92 100755
--- a/extra/convert-all.sh
+++ b/extra/convert-all.sh
@@ -1,6 +1,6 @@
 #!/bin/bash

-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )

 for model in "${models[@]}"; do
     python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
diff --git a/models/README.md b/models/README.md
index 10446a63d5f..b12f2d22d63 100644
--- a/models/README.md
+++ b/models/README.md
@@ -50,7 +50,8 @@ https://huggingface.co/ggerganov/whisper.cpp/tree/main
 | medium    | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
 | medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
 | large-v1  | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
-| large     | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
+| large-v2  | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
+| large     | 2.9 GB | ~4.7 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |

 ## Model files for testing purposes
diff --git a/models/convert-h5-to-coreml.py b/models/convert-h5-to-coreml.py
index 93c797ba5b2..3887c22a7e4 100644
--- a/models/convert-h5-to-coreml.py
+++ b/models/convert-h5-to-coreml.py
@@ -78,14 +78,14 @@ def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
 # Ported from models/convert-whisper-to-coreml.py
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
+    parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
     parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
     parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
     parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
     parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
     args = parser.parse_args()

-    if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
+    if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
         raise ValueError("Invalid model name")

     pt_target_path = f"models/hf-{args.model_name}.pt"
diff --git a/models/convert-pt-to-ggml.py b/models/convert-pt-to-ggml.py
index 9aa134b53f7..7a3daf238a8 100644
--- a/models/convert-pt-to-ggml.py
+++ b/models/convert-pt-to-ggml.py
@@ -228,7 +228,7 @@ def bytes_to_unicode():
 # for backwards compatibility, also check for older hf_transformers format tokenizer files
 # old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json
 # new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken
-multilingual = hparams["n_vocab"] == 51865
+multilingual = hparams["n_vocab"] >= 51865
 tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
 tokenizer_type = "tiktoken"
 if not tokenizer.is_file():
diff --git a/models/convert-whisper-to-coreml.py b/models/convert-whisper-to-coreml.py
index d4a7805209a..adbbd1099cb 100644
--- a/models/convert-whisper-to-coreml.py
+++ b/models/convert-whisper-to-coreml.py
@@ -194,7 +194,7 @@ def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
         x = x.permute(0,2,3,1).squeeze(0)

         # ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks
-        if self.token_embedding.weight.shape[0] == 51865:
+        if self.token_embedding.weight.shape[0] >= 51865:
             # split in 11 chunks - 4715 each
             splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//11, dim=0)
             logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)
@@ -296,13 +296,13 @@ def convert_decoder(hparams, model, quantize=False):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
+    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
     parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
     parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
     parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
     args = parser.parse_args()

-    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
+    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
         raise ValueError("Invalid model name")

     whisper = load_model(args.model).cpu()
diff --git a/models/convert-whisper-to-openvino.py b/models/convert-whisper-to-openvino.py
index cdee571b11f..6b3d396643b 100644
--- a/models/convert-whisper-to-openvino.py
+++ b/models/convert-whisper-to-openvino.py
@@ -38,10 +38,10 @@ def convert_encoder(hparams, encoder, mname):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
+    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
     args = parser.parse_args()

-    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
+    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
         raise ValueError("Invalid model name")

     whisper = load_model(args.model).cpu()
diff --git a/models/download-coreml-model.sh b/models/download-coreml-model.sh
index d46789d7c06..95739dbf995 100755
--- a/models/download-coreml-model.sh
+++ b/models/download-coreml-model.sh
@@ -19,7 +19,7 @@ function get_script_path() {
 models_path="$(get_script_path)"

 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )

 # list available models
 function list_models {
diff --git a/models/download-ggml-model.cmd b/models/download-ggml-model.cmd
index 9042e99b275..fc279967dac 100644
--- a/models/download-ggml-model.cmd
+++ b/models/download-ggml-model.cmd
@@ -8,7 +8,7 @@ popd
 set argc=0
 for %%x in (%*) do set /A argc+=1

-set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large
+set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large

 if %argc% neq 1 (
   echo.
@@ -57,8 +57,8 @@ goto :eof
 :list_models
   echo.
   echo Available models:
-  (for %%a in (%models%) do (
-    echo %%a
+  (for %%a in (%models%) do (
+    echo %%a
   ))
   echo.
  exit /b
diff --git a/models/download-ggml-model.sh b/models/download-ggml-model.sh
index 288e08d21e9..ea68da8936d 100755
--- a/models/download-ggml-model.sh
+++ b/models/download-ggml-model.sh
@@ -41,6 +41,7 @@ models=(
     "medium-q5_0"
     "medium.en-q5_0"
     "large-v1"
+    "large-v2"
     "large"
     "large-q5_0"
 )
diff --git a/tests/run-tests.sh b/tests/run-tests.sh
index 38fa5cea52b..bf062dd6b8d 100755
--- a/tests/run-tests.sh
+++ b/tests/run-tests.sh
@@ -19,7 +19,7 @@
 cd `dirname $0`

 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )

 # list available models
 function list_models {
diff --git a/whisper.cpp b/whisper.cpp
index 7f4f69a91d1..b6300d5f03c 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -193,6 +193,15 @@ enum e_model {
     MODEL_LARGE,
 };

+static const std::map<e_model, std::string> g_model_name = {
+    { MODEL_UNKNOWN, "unknown" },
+    { MODEL_TINY,    "tiny"    },
+    { MODEL_BASE,    "base"    },
+    { MODEL_SMALL,   "small"   },
+    { MODEL_MEDIUM,  "medium"  },
+    { MODEL_LARGE,   "large"   },
+};
+
 static const std::map<std::string, std::pair<int, std::string>> g_lang = {
     { "en",  { 0,  "english",   } },
     { "zh",  { 1,  "chinese",   } },
@@ -293,6 +302,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
     { "ba",  { 96, "bashkir",   } },
     { "jw",  { 97, "javanese",  } },
     { "su",  { 98, "sundanese", } },
+    { "yue", { 99, "cantonese", } },
 };

 static const size_t MB = 1ull*1024*1024;
@@ -402,7 +412,11 @@ struct whisper_vocab {
     id token_beg  = 50363; // begin timestamps

     bool is_multilingual() const {
-        return n_vocab == 51865;
+        return n_vocab >= 51865;
+    }
+
+    int num_languages() const {
+        return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
     }
 };
@@ -922,6 +936,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con

     assert(hparams.n_text_state == hparams.n_audio_state);

+    std::string mver = "";
+
     if (hparams.n_audio_layer == 4) {
         model.type = e_model::MODEL_TINY;
     }
@@ -940,6 +956,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con

     if (hparams.n_audio_layer == 32) {
         model.type = e_model::MODEL_LARGE;
+
+        if (hparams.n_vocab == 51866) {
+            mver = " v3";
+        }
     }

     const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
@@ -968,7 +988,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         log("%s: n_mels        = %d\n", __func__, hparams.n_mels);
         log("%s: ftype         = %d\n", __func__, model.hparams.ftype);
         log("%s: qntvr         = %d\n", __func__, qntvr);
-        log("%s: type          = %d\n", __func__, model.type);
+        log("%s: type          = %d (%s%s)\n", __func__, model.type, g_model_name.at(model.type).c_str(), mver.c_str());

         // print memory requirements
         {
@@ -1039,13 +1059,17 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         if (vocab.is_multilingual()) {
             vocab.token_eot++;
             vocab.token_sot++;
-            vocab.token_translate++;
-            vocab.token_transcribe++;
-            vocab.token_solm++;
-            vocab.token_prev++;
-            vocab.token_nosp++;
-            vocab.token_not++;
-            vocab.token_beg++;
+
+            // account for variable number of language tokens
+            const int dt = vocab.num_languages() - 98;
+
+            vocab.token_translate  += dt;
+            vocab.token_transcribe += dt;
+            vocab.token_solm       += dt;
+            vocab.token_prev       += dt;
+            vocab.token_nosp       += dt;
+            vocab.token_not        += dt;
+            vocab.token_beg        += dt;
         }

         if (n_vocab < model.hparams.n_vocab) {
@@ -1074,6 +1098,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                 vocab.id_to_token[i] = word;
             }
         }
+
+        log("%s: n_langs       = %d\n", __func__, vocab.num_languages());
     }

     size_t ctx_size = 0;
@@ -3281,7 +3307,7 @@ void whisper_free_params(struct whisper_full_params * params) {
 }

 int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
         log("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
@@ -3295,7 +3321,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int

 // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
 int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
         log("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
@@ -3318,13 +3344,13 @@ int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float *

 // TODO
 int whisper_set_mel_with_state(
-        struct whisper_context * /*ctx*/,
+        struct whisper_context * ctx,
         struct whisper_state * state,
         const float * data,
         int n_len,
         int n_mel) {
-    if (n_mel != WHISPER_N_MEL) {
-        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
+    if (n_mel != ctx->model.filters.n_mel) {
+        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, ctx->model.filters.n_mel);
         return -1;
     }
diff --git a/whisper.h b/whisper.h
index 300fc4bac37..ed1612b4bc8 100644
--- a/whisper.h
+++ b/whisper.h
@@ -29,7 +29,6 @@

 #define WHISPER_SAMPLE_RATE 16000
 #define WHISPER_N_FFT       400
-#define WHISPER_N_MEL       80
 #define WHISPER_HOP_LENGTH  160
 #define WHISPER_CHUNK_SIZE  30
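
---

Usage note: with WHISPER_N_MEL removed, the number of mel bands is a property of the
loaded model (80 for large-v2 and earlier, 128 for large-v3), so callers that sized
buffers with the old macro now query it via whisper_model_n_mels(), as the bench
examples above do. Below is a minimal standalone sketch of that pattern, not part of
the patch; the file name is illustrative and error handling is reduced to the basics:

    // mel_query.cpp -- standalone sketch, not part of whisper.cpp
    #include <cstdio>
    #include <vector>

    #include "whisper.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <model.bin>\n", argv[0]);
            return 1;
        }

        struct whisper_context * ctx = whisper_init_from_file(argv[1]);
        if (ctx == nullptr) {
            return 1;
        }

        // 80 for large-v2 and earlier, 128 for large-v3
        const int n_mels = whisper_model_n_mels(ctx);

        // an externally computed spectrogram must be sized n_len * n_mels
        const int n_len = 3000; // 30 s of audio at 100 frames per second
        std::vector<float> mel(n_len * n_mels, 0.0f);

        if (whisper_set_mel(ctx, mel.data(), n_len, n_mels) != 0) {
            fprintf(stderr, "failed to set mel\n");
        }

        whisper_free(ctx);
        return 0;
    }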
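The vocab change generalizes the old hard-coded "++" shift of the special tokens:
dt = num_languages() - 98 is 1 for the 99-language models (reproducing the previous
behavior) and 2 for large-v3, whose 51866-token vocab adds the "yue" entry registered
in g_lang. A quick standalone check of the arithmetic, with the 51765 and 98 constants
taken directly from the patch:

    // vocab_check.cpp -- standalone check mirroring whisper_vocab's logic
    #include <cstdio>
    #include <initializer_list>

    int main() {
        for (const int n_vocab : { 51865 /* <= large-v2 */, 51866 /* large-v3 */ }) {
            const bool multilingual = n_vocab >= 51865;
            const int  n_langs      = n_vocab - 51765 - (multilingual ? 1 : 0);
            const int  dt           = n_langs - 98;

            // token_beg defaults to 50363 and shifts by dt in multilingual models
            printf("n_vocab = %d -> n_langs = %d, dt = %d, token_beg = %d\n",
                   n_vocab, n_langs, dt, 50363 + dt);
        }
        // prints: n_vocab = 51865 -> n_langs = 99, dt = 1, token_beg = 50364
        //         n_vocab = 51866 -> n_langs = 100, dt = 2, token_beg = 50365
        return 0;
    }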