diff --git a/examples/helpers.js b/examples/helpers.js index 03b3a8576a0..98cc5c63ec4 100644 --- a/examples/helpers.js +++ b/examples/helpers.js @@ -145,7 +145,15 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) { var db = event.target.result; var tx = db.transaction(['models'], 'readwrite'); var os = tx.objectStore('models'); - var rq = os.put(data, url); + + var rq = null; + try { + rq = os.put(data, url); /* assign the rq declared above instead of redeclaring it */ + } catch (e) { + cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB: \n' + e); + cbCancel(); + return; + } rq.onsuccess = function (event) { cbPrint('loadRemote: "' + url + '" stored in the IndexedDB'); @@ -180,7 +188,6 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) { rq.onabort = function (event) { cbPrint('loadRemote: failed to open IndexedDB: abort'); - + cbCancel(); }; } - diff --git a/examples/whisper.wasm/CMakeLists.txt b/examples/whisper.wasm/CMakeLists.txt index 48c31adb586..75e5a8dec00 100644 --- a/examples/whisper.wasm/CMakeLists.txt +++ b/examples/whisper.wasm/CMakeLists.txt @@ -31,9 +31,9 @@ endif() set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \ --bind \ -s USE_PTHREADS=1 \ - -s PTHREAD_POOL_SIZE=8 \ - -s INITIAL_MEMORY=1500MB \ - -s TOTAL_MEMORY=1500MB \ + -s PTHREAD_POOL_SIZE_STRICT=0 \ + -s INITIAL_MEMORY=2000MB \ + -s TOTAL_MEMORY=2000MB \ -s FORCE_FILESYSTEM=1 \ -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \ ${EXTRA_FLAGS} \ diff --git a/examples/whisper.wasm/emscripten.cpp b/examples/whisper.wasm/emscripten.cpp index f92d814e0fc..db1ff789e5f 100644 --- a/examples/whisper.wasm/emscripten.cpp +++ b/examples/whisper.wasm/emscripten.cpp @@ -10,6 +10,12 @@ std::thread g_worker; std::vector g_contexts(4, nullptr); +static inline int mpow2(int n) { /* largest power of two <= n, clamped to at least 1 */ + int p = 1; + while (p <= n/2) p *= 2; /* n/2 bound: no signed overflow for large n */ + return p; /* n < 1 (e.g. hardware_concurrency() == 0) yields 1, not 0 */ +} + EMSCRIPTEN_BINDINGS(whisper) { emscripten::function("init", emscripten::optional_override([](const std::string & 
path_model) { if (g_worker.joinable()) { @@ -43,7 +49,7 @@ EMSCRIPTEN_BINDINGS(whisper) { } })); - emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) { + emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, int nthreads, bool translate) { if (g_worker.joinable()) { g_worker.join(); } @@ -66,7 +72,7 @@ EMSCRIPTEN_BINDINGS(whisper) { params.print_special = false; params.translate = translate; params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en"; - params.n_threads = std::min(8, (int) std::thread::hardware_concurrency()); + params.n_threads = std::min(nthreads, std::min(16, mpow2(std::thread::hardware_concurrency()))); params.offset_ms = 0; std::vector pcmf32; diff --git a/examples/whisper.wasm/index-tmpl.html b/examples/whisper.wasm/index-tmpl.html index 358ed70fbb9..d399beb7c04 100644 --- a/examples/whisper.wasm/index-tmpl.html +++ b/examples/whisper.wasm/index-tmpl.html @@ -40,21 +40,34 @@ Note that the computation is quite heavy and may take a few seconds to complete.
The transcription results will be displayed in the text area below.

- Important: your browser must support WASM SIMD instructions for this to work. + Important: + -


+
- Whisper model: + Whisper models:

- - +

+ Quantized models:

+ + + +
+ + + +

@@ -161,6 +174,12 @@ + + + Threads: + + 8 + @@ -263,11 +282,13 @@ Module.FS_createDataFile("/", fname, buf, true, true); - model_whisper = fname; + //model_whisper = fname; document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!'; printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length); + + document.getElementById('model').innerHTML = 'Model fetched: ' + model_whisper; } function loadFile(event, fname) { @@ -292,6 +313,15 @@ document.getElementById('fetch-whisper-tiny' ).style.display = 'none'; document.getElementById('fetch-whisper-base' ).style.display = 'none'; document.getElementById('fetch-whisper-small' ).style.display = 'none'; + + document.getElementById('fetch-whisper-base-en-q4_0' ).style.display = 'none'; + document.getElementById('fetch-whisper-base-q4_0' ).style.display = 'none'; + document.getElementById('fetch-whisper-small-en-q4_0' ).style.display = 'none'; + document.getElementById('fetch-whisper-small-q4_0' ).style.display = 'none'; + document.getElementById('fetch-whisper-medium-en-q4_0').style.display = 'none'; + document.getElementById('fetch-whisper-medium-q4_0' ).style.display = 'none'; + document.getElementById('fetch-whisper-large-q4_0' ).style.display = 'none'; + document.getElementById('whisper-file' ).style.display = 'none'; document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name; } @@ -304,6 +334,14 @@ 'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin', 'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin', 'small': 'https://whisper.ggerganov.com/ggml-model-whisper-small.bin', + + 'base-en-q4_0': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q4_0.bin', + 'base-q4_0': 'https://whisper.ggerganov.com/ggml-model-whisper-base-q4_0.bin', + 'small-en-q4_0': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q4_0.bin', + 'small-q4_0': 
'https://whisper.ggerganov.com/ggml-model-whisper-small-q4_0.bin', + 'medium-en-q4_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q4_0.bin', + 'medium-q4_0': 'https://whisper.ggerganov.com/ggml-model-whisper-medium-q4_0.bin', + 'large-q4_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q4_0.bin', }; let sizes = { @@ -313,6 +351,14 @@ 'base': 142, 'small.en': 466, 'small': 466, + + 'base-en-q4_0': 49, + 'base-q4_0': 49, + 'small-en-q4_0': 152, + 'small-q4_0': 152, + 'medium-en-q4_0': 469, + 'medium-q4_0': 469, + 'large-q4_0': 985, }; let url = urls[model]; @@ -327,6 +373,15 @@ document.getElementById('fetch-whisper-tiny' ).style.display = 'none'; document.getElementById('fetch-whisper-base' ).style.display = 'none'; document.getElementById('fetch-whisper-small' ).style.display = 'none'; + + document.getElementById('fetch-whisper-base-en-q4_0' ).style.display = 'none'; + document.getElementById('fetch-whisper-base-q4_0' ).style.display = 'none'; + document.getElementById('fetch-whisper-small-en-q4_0' ).style.display = 'none'; + document.getElementById('fetch-whisper-small-q4_0' ).style.display = 'none'; + document.getElementById('fetch-whisper-medium-en-q4_0').style.display = 'none'; + document.getElementById('fetch-whisper-medium-q4_0' ).style.display = 'none'; + document.getElementById('fetch-whisper-large-q4_0' ).style.display = 'none'; + document.getElementById('whisper-file' ).style.display = 'none'; document.getElementById('model-whisper-status' ).innerHTML = 'loading model: ' + model; @@ -337,12 +392,22 @@ cbCancel = function() { var el; + el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block'; el = 
document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-small' ); if (el) el.style.display = 'inline-block'; + + el = document.getElementById('fetch-whisper-base-en-q4_0' ); if (el) el.style.display = 'inline-block'; + el = document.getElementById('fetch-whisper-base-q4_0' ); if (el) el.style.display = 'inline-block'; + el = document.getElementById('fetch-whisper-small-en-q4_0' ); if (el) el.style.display = 'inline-block'; + el = document.getElementById('fetch-whisper-small-q4_0' ); if (el) el.style.display = 'inline-block'; + el = document.getElementById('fetch-whisper-medium-en-q4_0'); if (el) el.style.display = 'inline-block'; + el = document.getElementById('fetch-whisper-medium-q4_0' ); if (el) el.style.display = 'inline-block'; + el = document.getElementById('fetch-whisper-large-q4_0' ); if (el) el.style.display = 'inline-block'; + el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('model-whisper-status' ); if (el) el.innerHTML = ''; }; @@ -354,7 +419,8 @@ // audio file // - const kMaxAudio_s = 120; + const kMaxAudio_s = 30*60; + const kMaxRecording_s = 2*60; const kSampleRate = 16000; window.AudioContext = window.AudioContext || window.webkitAudioContext; @@ -423,7 +489,7 @@ doRecording = false; } - // record up to kMaxAudio_s seconds of audio from the microphone + // record up to kMaxRecording_s seconds of audio from the microphone // check if doRecording is false every 1000 ms and stop recording if so // update progress information function startRecording() { @@ -479,9 +545,9 @@ printTextarea('js: audio recorded, size: ' + audio.length); // truncate to first 30 seconds - if (audio.length > kMaxAudio_s*kSampleRate) { - audio = audio.slice(0, kMaxAudio_s*kSampleRate); - printTextarea('js: truncated audio to first ' + kMaxAudio_s + ' seconds'); + if (audio.length > kMaxRecording_s*kSampleRate) { + audio = 
audio.slice(0, kMaxRecording_s*kSampleRate); + printTextarea('js: truncated audio to first ' + kMaxRecording_s + ' seconds'); } setAudio(audio); }); @@ -509,24 +575,31 @@ }); } - document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxAudio_s) + '%'; - document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxAudio_s).toFixed(0) + '%'; + document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxRecording_s) + '%'; + document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxRecording_s).toFixed(0) + '%'; }, 1000); printTextarea('js: recording ...'); setTimeout(function() { if (doRecording) { - printTextarea('js: recording stopped after ' + kMaxAudio_s + ' seconds'); + printTextarea('js: recording stopped after ' + kMaxRecording_s + ' seconds'); stopRecording(); } - }, kMaxAudio_s*1000); + }, kMaxRecording_s*1000); } // // transcribe // + var nthreads = 8; + + function changeThreads(value) { + nthreads = parseInt(value, 10) || nthreads; /* value may arrive as a string; keep the old count if it is not a positive integer */ + document.getElementById('threads-value').innerHTML = nthreads; + } + function onProcess(translate) { if (!instance) { instance = Module.init('whisper.bin'); @@ -553,7 +626,7 @@ printTextarea(''); setTimeout(function() { - var ret = Module.full_default(instance, audio, document.getElementById('language').value, translate); + var ret = Module.full_default(instance, audio, document.getElementById('language').value, nthreads, translate); console.log('js: full_default returned: ' + ret); if (ret) { printTextarea("js: whisper returned: " + ret); diff --git a/whisper.cpp b/whisper.cpp index 2c489b92466..7e01e0e0691 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -255,12 +255,34 @@ static const std::map MEM_REQ_SCRATCH3 = { { MODEL_LARGE, 9ull*MB }, }; -static const std::map MEM_REQ_MODEL = { - { MODEL_TINY, 74ull*MB }, - { MODEL_BASE, 142ull*MB }, - { MODEL_SMALL, 466ull*MB }, - { MODEL_MEDIUM, 1464ull*MB }, - { MODEL_LARGE, 
2952ull*MB }, +static const std::map> MEM_REQ_MODEL = { + { GGML_TYPE_F16, + { + { MODEL_TINY, 74ull*MB }, + { MODEL_BASE, 142ull*MB }, + { MODEL_SMALL, 466ull*MB }, + { MODEL_MEDIUM, 1464ull*MB }, + { MODEL_LARGE, 2952ull*MB }, + }, + }, + { GGML_TYPE_Q4_0, + { + { MODEL_TINY, 26ull*MB }, + { MODEL_BASE, 50ull*MB }, + { MODEL_SMALL, 154ull*MB }, + { MODEL_MEDIUM, 470ull*MB }, + { MODEL_LARGE, 940ull*MB }, + }, + }, + { GGML_TYPE_Q4_1, + { + { MODEL_TINY, 31ull*MB }, + { MODEL_BASE, 57ull*MB }, + { MODEL_SMALL, 181ull*MB }, + { MODEL_MEDIUM, 559ull*MB }, + { MODEL_LARGE, 1122ull*MB }, + }, + }, }; static const std::map MEM_REQ_KV_SELF = { @@ -697,7 +719,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) { const ggml_type wtype = cache.k->type; WHISPER_ASSERT(wtype == cache.v->type); - WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype)); + WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype)); struct ggml_init_params params = { /*.mem_size =*/ cache.buf.size(), @@ -794,12 +816,25 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con model.type = e_model::MODEL_LARGE; } - // for the big tensors, we have the option to store the data in 16-bit floats + // for the big tensors, we have the option to store the data in 16-bit floats or quantized // in order to save memory and also to speed up the computation - wctx.wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32; + wctx.wtype = GGML_TYPE_COUNT; + switch (model.hparams.f16) { + case 0: wctx.wtype = GGML_TYPE_F32; break; + case 1: wctx.wtype = GGML_TYPE_F16; break; + case 2: wctx.wtype = GGML_TYPE_Q4_0; break; + case 3: wctx.wtype = GGML_TYPE_Q4_1; break; + default: + { + fprintf(stderr, "%s: invalid model (bad f16 value %d)\n", __func__, model.hparams.f16); + return false; + } + } const size_t scale = model.hparams.f16 ? 
1 : 2; + static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; + fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx); fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state); @@ -810,7 +845,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head); fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer); fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels); - fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); + fprintf(stderr, "%s: ftype = %s\n", __func__, ftype_str[model.hparams.f16]); fprintf(stderr, "%s: type = %d\n", __func__, model.type); // print memory requirements @@ -821,7 +856,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con MEM_REQ_SCRATCH1.at (model.type) + MEM_REQ_SCRATCH2.at (model.type) + MEM_REQ_SCRATCH3.at (model.type) + - scale*MEM_REQ_MODEL.at (model.type) + + scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type) + scale*MEM_REQ_KV_CROSS.at(model.type) + scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)); @@ -837,7 +872,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con // always have at least one decoder wctx.model.buf = new std::vector(); - wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(model.type)); + wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type)); // we skip initialization of the state until it is needed // because it might be that state will always be provided externally. 
@@ -946,92 +981,92 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con // encoder { - ctx_size += n_audio_ctx*n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_pe; + ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe; - ctx_size += 3*n_mels*n_audio_state*ggml_type_size(wtype); // e_conv_1_w - ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_1_b + ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(GGML_TYPE_F16); // e_conv_1_w + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b - ctx_size += 3*n_audio_state*n_audio_state*ggml_type_size(wtype); // e_conv_2_w - ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_2_b + ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(GGML_TYPE_F16); // e_conv_2_w + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b - ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_w; - ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_b; + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w; + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b; } // decoder { - ctx_size += n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // d_pe; + ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe; - ctx_size += n_vocab*n_text_state*ggml_type_size(wtype); // d_te; + ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te; - ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_w; - ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_b; + ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w; + ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b; } // encoder layers { - ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w - ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b + ctx_size += 
n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b - ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_0_w - ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b + ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_0_w + ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b - ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_1_w - ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b + ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_1_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b - ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w - ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b - ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_q_w - ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_q_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b - ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_k_w + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w - ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_v_w - ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b + ctx_size 
+= n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_v_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b - ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_ln_1_w - ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_ln_1_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b } // decoder layers { - ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w - ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b - ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_0_w - ctx_size += n_text_layer*( 4*n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b + ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_0_w + ctx_size += n_text_layer*( 4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b - ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_1_w - ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b + ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b - ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w - ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b - ctx_size += 
n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_q_w - ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_q_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b - ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_k_w + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w - ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_v_w - ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_v_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b - ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_ln_1_w - ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_ln_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b // - ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_w - ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_b + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b - ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_q_w - ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_q_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_q_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b 
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_k_w + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w - ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_v_w - ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_v_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_v_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b - ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_ln_1_w - ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_ln_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b } ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead @@ -1079,10 +1114,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con { model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx); - model.e_conv_1_w = ggml_new_tensor_3d(ctx, wtype, 3, n_mels, n_audio_state); + model.e_conv_1_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 3, n_mels, n_audio_state); model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state); - model.e_conv_2_w = ggml_new_tensor_3d(ctx, wtype, 3, n_audio_state, n_audio_state); + model.e_conv_2_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 3, n_audio_state, n_audio_state); model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state); model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); @@ -1298,9 +1333,21 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con return false; } - const size_t bpe = (ftype == 0) ? 
sizeof(float) : sizeof(ggml_fp16_t); + size_t bpe = 0; + + switch (ftype) { + case 0: bpe = ggml_type_size(GGML_TYPE_F32); break; + case 1: bpe = ggml_type_size(GGML_TYPE_F16); break; + case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; + case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; + default: + { + fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); + return false; + } + }; - if (nelements*bpe != ggml_nbytes(tensor)) { + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); return false; @@ -4834,23 +4881,32 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { // when F16 is used, there is an extra work buffer of size N*N*sizeof(float) std::vector buf(4llu*N_max*N_max*sizeof(float) + 4*256); + // put a bunch of random data in the buffer for (size_t i = 0; i < buf.size(); i++) buf[i] = i; for (int j = 0; j < (int) sizes.size(); j++) { + int n_q4_0 = 0; + int n_q4_1 = 0; int n_fp16 = 0; int n_fp32 = 0; // GFLOPS/s + double s_q4_0 = 0.0; + double s_q4_1 = 0.0; double s_fp16 = 0.0; double s_fp32 = 0.0; const size_t N = sizes[j]; - for (int k = 0; k < 2; ++k) { - const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32; + for (int k = 0; k < 4; ++k) { + const ggml_type wtype = + k == 0 ? GGML_TYPE_Q4_0 : + k == 1 ? GGML_TYPE_Q4_1 : + k == 2 ? GGML_TYPE_F16 : + GGML_TYPE_F32; - double & s = k == 0 ? s_fp16 : s_fp32; - int & n = k == 0 ? n_fp16 : n_fp32; + double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_fp16 : s_fp32; + int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? 
n_fp16 : n_fp32; struct ggml_init_params gparams = { /*.mem_size =*/ buf.size(), @@ -4894,8 +4950,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { s = ((2.0*N*N*N*n)/tsum)*1e-9; } - snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n", - N, N, s_fp16, n_fp16, s_fp32, n_fp32); + snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) / Q4_1 %7.1f GFLOPS (%3d runs) / F16 %7.1f GFLOPS (%3d runs) / F32 %7.1f GFLOPS (%3d runs)\n", + N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_fp16, n_fp16, s_fp32, n_fp32); s += strbuf; }