diff --git a/README.md b/README.md
index c4872eb12..07b537165 100644
--- a/README.md
+++ b/README.md
@@ -6,9 +6,9 @@ KanTV("Kan", aka Chinese PinYin "Kan" or Chinese HanZi "看" or English "watch/l
- Record online TV to automatically generate videos (useful for short video creators to generate short video materials but pls respect IPR of original content creator/provider); record online TV's video / audio content for gather video / audio data which might be required of/useful for AI R&D activity
-- ASR(Automatic Speech Recognition, a sub-filed of AI) research by the great whisper.cpp
+- ASR(Automatic Speech Recognition, a subfield of AI) research by the great whisper.cpp
-- LLM(Large Language Model, a sub-filed of AI) research by the great llama.cpp
+- LLM(Large Language Model, a subfield of AI) research by the great llama.cpp
- Real-time English subtitle for English online-TV(aka OTT TV) by the great & excellent & amazing whisper.cpp (PoC finished on Xiaomi 14. Xiaomi 14 or other powerful Android mobile phone is HIGHLY required/recommended for real-time subtitle feature otherwise unexpected behavior would happen)
@@ -27,7 +27,7 @@ Some goals of this project are:
- Well-maintained "workbench" for LLM(Large Language Model) researchers who was interested in practise state-of-the-art AI tech(like [llama.cpp](https://github.com/ggerganov/llama.cpp)) in real scenario on mobile device(Android)
-- Android turn-key project for AI experts(whom mightbe not familiar with regular Android software development) focus on AI research activity, part of AI R&D activity(algorithm improvement, model training, model generation, algorithm validation, model validation, performance benchmark......) could be done by Android Studio IDE + a powerful Android phone very easily
+- Android turn-key project for AI experts/researchers(who might not be familiar with regular Android software development) to focus on device-side AI R&D activity; part of the AI R&D activity(algorithm improvement, model training, model generation, algorithm validation, model validation, performance benchmark......) could be done very easily with Android Studio IDE + a powerful Android phone
### How to build project
@@ -150,24 +150,38 @@ autocmd InsertEnter * match ForbiddenWhitespace /\t\|\s\+\%#\@<!$/
-modify build/envsetup.sh accordingly before launch build
+ - download android-ndk-r26c to prebuilts/toolchain, skip this step if android-ndk-r26c already exists
-pay attention here and modify it accordingly if build-target is kantv-android and running Android device is NOT Xiaomi 14
+```
-(TIP: a VERY powerful Linux PC / Linux workstation is HIGHLY recommended for this step)
+./build/prebuild-download.sh
```
+
+ - modify build/envsetup.sh accordingly before launching the build
+
+ - modify whispercpp/CMakeLists.txt accordingly if the build-target is kantv-android and the running Android device is NOT Xiaomi 14
+
+
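+Putting the prerequisite steps above together, a typical first-time setup looks like this (a minimal sketch; it assumes the commands are run from the project root and that build/envsetup.sh and whispercpp/CMakeLists.txt have already been adjusted for your device):
+
+```
+./build/prebuild-download.sh    # fetch android-ndk-r26c into prebuilts/toolchain (skipped if it already exists)
+. build/envsetup.sh             # export the build environment for the selected build-target
+```
+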
+#### Build native code
+
+```
+
. build/envsetup.sh
-(download android-ndk-r26c to prebuilts/toolchain, skip this step if android-ndk-r26c is already exist)
-./build/prebuild-download.sh
```
![Screenshot from 2024-03-21 21-41-41](https://github.com/zhouwg/kantv/assets/6889919/3e13946f-596b-44be-9716-5793ce0c7263)
@@ -184,7 +198,7 @@ pay attention i
TextView _txtLLMInfo;
TextView _txtGGMLInfo;
TextView _txtGGMLStatus;
+ EditText _txtUserInput;
+ Button _btnInference;
- Button _btnBenchmark;
-
-
- private int nThreadCounts = 1;
+ private int nThreadCounts = 8;
private int benchmarkIndex = 0;
- private String strModeName = "tiny";
private long beginTime = 0;
private long endTime = 0;
private long duration = 0;
private String strBenchmarkInfo;
+ private String strUserInput = "how many days in this month?";
private AtomicBoolean isBenchmarking = new AtomicBoolean(false);
private ProgressDialog mProgressDialog;
+ // https://huggingface.co/TheBloke/Llama-2-7B-GGUF
+ // https://huggingface.co/TheBloke/Llama-2-13B-GGUF
+ // https://huggingface.co/TheBloke/Llama-2-70B-GGUF
+
// https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF
- // https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf
+ // https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF
+ // https://huggingface.co/TheBloke/Llama-2-70B-Chat-GGUF
+
+ //private String ggmlModelFileName = "llama-2-7b.Q4_K_M.gguf"; //4.08 GB
private String ggmlModelFileName = "llama-2-7b-chat.Q4_K_M.gguf"; //4.08 GB
private Context mContext;
private Activity mActivity;
private Settings mSettings;
+ private KANTVMgr mKANTVMgr = null;
+ private LLMResearchFragment.MyEventListener mEventListener = new LLMResearchFragment.MyEventListener();
public static LLMResearchFragment newInstance() {
return new LLMResearchFragment();
@@ -138,24 +153,137 @@ public void initView() {
_txtGGMLInfo = (TextView) mActivity.findViewById(R.id.ggmlInfoLLM);
_txtGGMLStatus = (TextView) mActivity.findViewById(R.id.ggmlStatusLLM);
+ //TODO: change to voice input, and then use whisper.cpp to convert it into text
+ _txtUserInput = (EditText) mActivity.findViewById(R.id.txtUserInput);
+
+ _btnInference = (Button) mActivity.findViewById(R.id.btnInference);
+
_txtLLMInfo.setCompoundDrawablesWithIntrinsicBounds(null, null, null, null);
+ _txtLLMInfo.setMovementMethod(ScrollingMovementMethod.getInstance());
displayFileStatus(CDEUtils.getDataPath() + ggmlModelFileName);
- CDELog.j(TAG, "load LLM model");
+ try {
+ CDELibraryLoader.load("whispercpp");
+ CDELog.j(TAG, "cpu core counts:" + whispercpp.get_cpu_core_counts());
+ } catch (Exception e) {
+ CDELog.j(TAG, "failed to initialize GGML jni");
+ return;
+ }
+
+ try {
+ initKANTVMgr();
+ } catch (Exception e) {
+ CDELog.j(TAG, "failed to initialize asr subsystem");
+ return;
+ }
+
+ CDELog.j(TAG, "load ggml's LLM model");
+ String systemInfo = whispercpp.llm_get_systeminfo();
String phoneInfo = "Device info:" + "\n"
+ "Brand:" + Build.BRAND + "\n"
+ "Hardware:" + Build.HARDWARE + "\n"
+ "OS:" + "Android " + android.os.Build.VERSION.RELEASE + "\n"
- + "Arch:" + Build.CPU_ABI ;
+ + "Arch:" + Build.CPU_ABI + "(" + systemInfo + ")";
_txtGGMLInfo.setText("");
_txtGGMLInfo.append(phoneInfo + "\n");
_txtGGMLInfo.append("Powered by llama.cpp(https://github.com/ggerganov/llama.cpp)\n");
+ _btnInference.setOnClickListener(v -> {
+ String strPrompt = _txtUserInput.getText().toString();
+ if (strPrompt.isEmpty()) {
+ //CDEUtils.showMsgBox(mActivity, "pls check your input");
+ //return;
+ //just for test
+ strPrompt = strUserInput;
+ }
+ strPrompt = strPrompt.trim();
+ strUserInput = strPrompt;
+ CDELog.j(TAG, "User input: \n " + strUserInput);
+
+            CDELog.j(TAG, "model file name:" + ggmlModelFileName);
+
+ String selectModeFileName = ggmlModelFileName;
+ String selectModelFilePath = CDEUtils.getDataPath() + selectModeFileName;
+ CDELog.j(TAG, "selectModelFilePath:" + selectModelFilePath);
+ File selectModeFile = new File(selectModelFilePath);
+ displayFileStatus(selectModelFilePath);
+            if (!selectModeFile.exists()) {
+                CDELog.j(TAG, "model file not exist:" + selectModeFile.getAbsolutePath());
+                CDEUtils.showMsgBox(mActivity, "pls check whether GGML's model file exists in /sdcard/kantv/");
+                return;
+            }
+ ggmlModelFileName = selectModeFileName;
+ CDELog.j(TAG, "model file:" + CDEUtils.getDataPath() + selectModeFileName);
+
+ isBenchmarking.set(true);
+
+ Toast.makeText(mContext, mContext.getString(R.string.ggml_benchmark_start), Toast.LENGTH_LONG).show();
+
+ _txtLLMInfo.setText("");
+ _btnInference.setEnabled(false);
+
+ WindowManager.LayoutParams attributes = mActivity.getWindow().getAttributes();
+ attributes.screenBrightness = 1.0f;
+ mActivity.getWindow().setAttributes(attributes);
+ mActivity.getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
+
+ launchGGMLBenchmarkThread();
+
+ });
endTime = System.currentTimeMillis();
CDELog.j(TAG, "initView cost: " + (endTime - beginTime) + " milliseconds");
}
+ private final void launchGGMLBenchmarkThread() {
+ Thread workThread = new Thread(new Runnable() {
+ @RequiresApi(api = Build.VERSION_CODES.O)
+ @Override
+ public void run() {
+ strBenchmarkInfo = "";
+
+ while (isBenchmarking.get()) {
+ beginTime = System.currentTimeMillis();
+                    // a TextView must be updated on the UI thread, not on this worker thread
+                    mActivity.runOnUiThread(() -> _txtGGMLStatus.setText("LLAMA inference is progressing..."));
+ strBenchmarkInfo = whispercpp.llm_bench(
+ CDEUtils.getDataPath() + ggmlModelFileName,
+ strUserInput,
+ benchmarkIndex,
+ nThreadCounts);
+ endTime = System.currentTimeMillis();
+ duration = (endTime - beginTime);
+ isBenchmarking.set(false);
+
+ mActivity.runOnUiThread(new Runnable() {
+ @Override
+ public void run() {
+ String benchmarkTip = "LLAMA inference " + "(model: " + ggmlModelFileName
+ + " ,threads: " + nThreadCounts
+ + " ) cost " + duration + " milliseconds";
+ benchmarkTip += "\n";
+
+ if (!strBenchmarkInfo.startsWith("unknown")) {
+ benchmarkTip += strBenchmarkInfo;
+ }
+
+ CDELog.j(TAG, benchmarkTip);
+ _txtGGMLStatus.append(benchmarkTip);
+
+ _btnInference.setEnabled(true);
+ }
+ });
+ }
+
+
+ }
+ });
+ workThread.start();
+
+ }
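+
+    // note: whispercpp.llm_bench() blocks until llama.cpp finishes the requested inference, which is why it runs
+    // on the worker thread above; the final status line is posted back through runOnUiThread(), while streaming
+    // output arrives asynchronously via MyEventListener.onEvent() and is appended to _txtLLMInfo there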
+
@Override
public void initListener() {
@@ -178,7 +306,6 @@ public void onStop() {
}
-
private void displayFileStatus(String modelFilePath) {
_txtGGMLStatus.setText("");
File modelFile = new File(modelFilePath);
@@ -189,4 +316,85 @@ private void displayFileStatus(String modelFilePath) {
_txtGGMLStatus.append("model file not exist: " + modelFile.getAbsolutePath());
}
}
+
+ protected class MyEventListener implements KANTVEventListener {
+
+ MyEventListener() {
+ }
+
+
+ @Override
+ public void onEvent(KANTVEventType eventType, int what, int arg1, int arg2, Object obj) {
+ String eventString = "got event from native layer: " + eventType.toString() + " (" + what + ":" + arg1 + " ) :" + (String) obj;
+ String content = (String) obj;
+
+ if (eventType.getValue() == KANTVEvent.KANTV_ERROR) {
+ CDELog.j(TAG, "ERROR:" + eventString);
+ _txtLLMInfo.setText("ERROR:" + content);
+ }
+
+ if (eventType.getValue() == KANTVEvent.KANTV_INFO) {
+ if ((arg1 == KANTV_INFO_ASR_STOP)
+ || (arg1 == KANTV_INFO_ASR_FINALIZE)
+ ) {
+ return;
+ }
+
+ //CDELog.j(TAG, "content:" + content);
+                if (!content.startsWith("unknown")) {
+                    if (content.startsWith("llama-timings")) {
+                        _txtGGMLStatus.setText("");
+                        _txtGGMLStatus.append(content);
+                    } else {
+                        _txtLLMInfo.append(content);
+                    }
+                }
+ }
+ }
+ }
+
+
+ private void initKANTVMgr() {
+ if (mKANTVMgr != null) {
+ return;
+ }
+
+ try {
+ mKANTVMgr = new KANTVMgr(mEventListener);
+ if (mKANTVMgr != null) {
+ mKANTVMgr.initASR();
+ mKANTVMgr.startASR();
+ }
+ CDELog.j(TAG, "KANTVMgr version:" + mKANTVMgr.getMgrVersion());
+ } catch (KANTVException ex) {
+ String errorMsg = "An exception was thrown because:\n" + " " + ex.getMessage();
+ CDELog.j(TAG, "error occurred: " + errorMsg);
+ CDEUtils.showMsgBox(mActivity, errorMsg);
+ ex.printStackTrace();
+ }
+ }
+
+
+ public void release() {
+ if (mKANTVMgr == null) {
+ return;
+ }
+
+ try {
+ CDELog.j(TAG, "release");
+ {
+ mKANTVMgr.finalizeASR();
+ mKANTVMgr.stopASR();
+ mKANTVMgr.release();
+ mKANTVMgr = null;
+ }
+ } catch (Exception ex) {
+ String errorMsg = "An exception was thrown because:\n" + " " + ex.getMessage();
+ CDELog.j(TAG, "error occurred: " + errorMsg);
+ ex.printStackTrace();
+ }
+ }
+
}
diff --git a/cdeosplayer/kantv/src/main/res/drawable/llamacpp_logo.png b/cdeosplayer/kantv/src/main/res/drawable/llamacpp_logo.png
index 1b6946be9..62eb4bead 100644
Binary files a/cdeosplayer/kantv/src/main/res/drawable/llamacpp_logo.png and b/cdeosplayer/kantv/src/main/res/drawable/llamacpp_logo.png differ
diff --git a/cdeosplayer/kantv/src/main/res/drawable/textview_border.xml b/cdeosplayer/kantv/src/main/res/drawable/textview_border.xml
new file mode 100644
index 000000000..9ba457ede
--- /dev/null
+++ b/cdeosplayer/kantv/src/main/res/drawable/textview_border.xml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml b/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml
index d27501a50..64dfd75ac 100755
--- a/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml
+++ b/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml
@@ -31,7 +31,7 @@
+ android:layout_height="210dp" />
+ android:layout_height="210dp" />
-
-
+
+
+
+
diff --git a/external/.gitignore b/external/.gitignore
index fc98eb947..42292a82e 100644
--- a/external/.gitignore
+++ b/external/.gitignore
@@ -2,7 +2,6 @@ DeepSpeech/
gstreamer/
ncnn/
CLBlast/
-llamacpp/
*.a
*.so
diff --git a/external/whispercpp/CMakeLists.txt b/external/whispercpp/CMakeLists.txt
index d0af8d823..370c47198 100644
--- a/external/whispercpp/CMakeLists.txt
+++ b/external/whispercpp/CMakeLists.txt
@@ -5,7 +5,7 @@
# Description: build libwhispercpp.so for target Android
-cmake_minimum_required(VERSION 3.10)
+cmake_minimum_required(VERSION 3.22.1) # make llamacpp happy
project(whispercpp)
set(CMAKE_VERBOSE_MAKEFILE on)
@@ -21,11 +21,19 @@ set(SOURCE_FILES
${WHISPERCPP_SRC_DIR}/ggml-alloc.c
${WHISPERCPP_SRC_DIR}/ggml-backend.c
${WHISPERCPP_SRC_DIR}/ggml-quants.c
+
+
${WHISPERCPP_SRC_DIR}/whisper.cpp
+ ${LLAMACPP_SRC_DIR}/llama.cpp
+ ${LLAMACPP_SRC_DIR}/unicode.cpp
+
${WHISPERCPP_SRC_DIR}/jni/tinywav.c
${WHISPERCPP_SRC_DIR}/jni/whispercpp-jni.c
${WHISPERCPP_SRC_DIR}/jni/whispercpp-jni-impl.cpp
+ ${WHISPERCPP_SRC_DIR}/jni/sampling.cpp
+ ${WHISPERCPP_SRC_DIR}/jni/common.cpp
+ ${WHISPERCPP_SRC_DIR}/jni/grammar-parser.cpp
)
@@ -42,6 +50,9 @@ include_directories(${WHISPERCPP_SRC_DIR}/)
include_directories(${WHISPERCPP_SRC_DIR}/jni)
include_directories(${PREBUIT_INC_PATH}/)
+#re-use the ggml.h in subdirectory llamacpp/ggml.h to avoid the NDK complaint "error: redefinition of 'ggml_status'"
+include_directories(${LLAMACPP_SRC_DIR}/)
+
add_definitions(-DTARGET_ANDROID)
add_definitions(-D__ARM_NEON)
diff --git a/external/whispercpp/ggml-alloc.c b/external/whispercpp/ggml-alloc.c
index 60b86c275..643b2e55f 100644
--- a/external/whispercpp/ggml-alloc.c
+++ b/external/whispercpp/ggml-alloc.c
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
- if (ggml_is_view(node)) {
+ // TODO: better way to add external dependencies
+ // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+ // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+ // itself is never used and should not be considered a dependency
+ if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
struct ggml_tensor * view_src = node->view_src;
ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
}
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
ggml_gallocr_hash_get(galloc, src)->n_children += 1;
- // allocate explicit inputs and leafs
- if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+ // allocate explicit inputs
+ if (src->flags & GGML_TENSOR_FLAG_INPUT) {
ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
}
}
@@ -701,13 +705,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
struct ggml_tensor * leaf = graph->leafs[i];
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
- if (leaf->view_src || leaf->data) {
- galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
- galloc->leaf_allocs[i].leaf.size_max = 0;
- } else {
- galloc->leaf_allocs[i].leaf.offset = hn->offset;
- galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
- }
+ galloc->leaf_allocs[i].leaf.offset = hn->offset;
+ galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
}
// reallocate buffers if needed
diff --git a/external/whispercpp/ggml-backend-impl.h b/external/whispercpp/ggml-backend-impl.h
index e475e20e5..f121e1de4 100644
--- a/external/whispercpp/ggml-backend-impl.h
+++ b/external/whispercpp/ggml-backend-impl.h
@@ -103,6 +103,11 @@ extern "C" {
// check if the backend supports an operation
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+ // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+ // these should be expensive operations with large batch sizes that may benefit from running on this backend
+ // even if the weight has to be copied from the CPU temporarily
+ bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
// (optional) event synchronization
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
void (*GGML_CALL event_free) (ggml_backend_event_t event);
diff --git a/external/whispercpp/ggml-backend.c b/external/whispercpp/ggml-backend.c
index 31f8d5a6d..402d86ef3 100644
--- a/external/whispercpp/ggml-backend.c
+++ b/external/whispercpp/ggml-backend.c
@@ -278,7 +278,7 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
return err;
}
-bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
return backend->iface.graph_compute(backend, cgraph);
}
@@ -286,6 +286,13 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
return backend->iface.supports_op(backend, op);
}
+bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+ if (backend->iface.offload_op != NULL) {
+ return backend->iface.offload_op(backend, op);
+ }
+ return false;
+}
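+// note: when a backend does not implement offload_op, the scheduler keeps the op on the backend that
+// already owns the weights (see ggml_backend_sched_backend_id_from_cur below)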
+
// backend copy
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -413,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
// add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
ggml_backend_cuda_reg_devices();
#endif
@@ -761,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
if (cpu_plan->cplan.work_size > 0) {
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+ if (cpu_plan->cplan.work_data == NULL) {
+ free(cpu_plan);
+ return NULL;
+ }
}
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
@@ -834,6 +845,7 @@ static struct ggml_backend_i cpu_backend_i = {
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
/* .supports_op = */ ggml_backend_cpu_supports_op,
+ /* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,
@@ -999,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) {
#endif
#ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS 256
+#define GGML_SCHED_MAX_SPLITS 2048
#endif
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
#endif
#ifndef GGML_SCHED_MAX_COPIES
@@ -1043,8 +1055,9 @@ struct ggml_backend_sched {
struct ggml_cgraph * graph;
// graph splits
- struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
+ struct ggml_backend_sched_split * splits;
int n_splits;
+ int splits_capacity;
// pipeline parallelism support
int n_copies;
@@ -1114,40 +1127,48 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
// TODO: use supports_op to check if the backend supports the op
// assign pre-allocated nodes to their backend
- // dst
- int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
- if (cur_backend != -1) {
+ int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+ if (cur_backend_id != -1) {
SET_CAUSE(tensor, "1.dst");
- return cur_backend;
+ return cur_backend_id;
}
// view_src
if (tensor->view_src != NULL) {
- cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
- if (cur_backend != -1) {
+ cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+ if (cur_backend_id != -1) {
SET_CAUSE(tensor, "1.vsrc");
- return cur_backend;
+ return cur_backend_id;
}
}
- // input
+ // graph input
if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
- cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
+ cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
SET_CAUSE(tensor, "1.inp");
- return cur_backend;
+ return cur_backend_id;
}
// assign nodes that use weights to the backend of the weights
+ // operations with weights are preferably run on the same backend as the weights
for (int i = 0; i < GGML_MAX_SRC; i++) {
const struct ggml_tensor * src = tensor->src[i];
if (src == NULL) {
continue;
}
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
- int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
- // operations with weights are always run on the same backend as the weights
+ int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+ // check if a backend with higher prio wants to offload the op
+ if (src_backend_id == sched->n_backends - 1) {
+ for (int b = 0; b < src_backend_id; b++) {
+ if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+ SET_CAUSE(tensor, "1.off");
+ return b;
+ }
+ }
+ }
SET_CAUSE(tensor, "1.wgt%d", i);
- return src_backend;
+ return src_backend_id;
}
}
@@ -1227,28 +1248,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// pass 1: assign backends to ops with pre-allocated inputs
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
- if (tensor_backend_id(leaf) != -1) {
+ int * leaf_backend_id = &tensor_backend_id(leaf);
+ if (*leaf_backend_id != -1) {
// do not overwrite user assignments
continue;
}
- tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
+ *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
}
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
- if (tensor_backend_id(node) != -1) {
+ int * node_backend_id = &tensor_backend_id(node);
+ if (*node_backend_id != -1) {
// do not overwrite user assignments
continue;
}
- tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
+ *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
// src
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
continue;
}
- if (tensor_backend_id(src) == -1) {
- tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
+ int * src_backend_id = &tensor_backend_id(src);
+ if (*src_backend_id == -1) {
+ *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
}
}
}
@@ -1270,21 +1294,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (ggml_is_view_op(node->op)) {
continue;
}
- int tensor_backend_id = tensor_backend_id(node);
- if (tensor_backend_id != -1) {
- if (tensor_backend_id == sched->n_backends - 1) {
+ int * node_backend_id = &tensor_backend_id(node);
+ if (*node_backend_id != -1) {
+ if (*node_backend_id == sched->n_backends - 1) {
// skip cpu (lowest prio backend)
cur_backend_id = -1;
} else {
- cur_backend_id = tensor_backend_id;
+ cur_backend_id = *node_backend_id;
}
} else {
- tensor_backend_id(node) = cur_backend_id;
+ *node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.2");
}
}
}
-
// pass 2.1 expand gpu up
{
int cur_backend_id = -1;
@@ -1293,22 +1316,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (ggml_is_view_op(node->op)) {
continue;
}
- int tensor_backend_id = tensor_backend_id(node);
- if (tensor_backend_id != -1) {
- if (tensor_backend_id == sched->n_backends - 1) {
+ int * node_backend_id = &tensor_backend_id(node);
+ if (*node_backend_id != -1) {
+ if (*node_backend_id == sched->n_backends - 1) {
// skip cpu (lowest prio backend)
cur_backend_id = -1;
} else {
- cur_backend_id = tensor_backend_id;
+ cur_backend_id = *node_backend_id;
}
} else {
- tensor_backend_id(node) = cur_backend_id;
+ *node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.1");
}
}
}
-
-
// pass 2.4 expand rest down
{
int cur_backend_id = -1;
@@ -1317,16 +1338,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (ggml_is_view_op(node->op)) {
continue;
}
- int tensor_backend_id = tensor_backend_id(node);
- if (tensor_backend_id != -1) {
- cur_backend_id = tensor_backend_id;
+ int * node_backend_id = &tensor_backend_id(node);
+ if (*node_backend_id != -1) {
+ cur_backend_id = *node_backend_id;
} else {
- tensor_backend_id(node) = cur_backend_id;
+ *node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.4");
}
}
}
- // pass 2.3 expand rest up
+ // pass 2.3 expand rest up
{
int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1334,11 +1355,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (ggml_is_view_op(node->op)) {
continue;
}
- int tensor_backend_id = tensor_backend_id(node);
- if (tensor_backend_id != -1) {
- cur_backend_id = tensor_backend_id;
+ int * node_backend_id = &tensor_backend_id(node);
+ if (*node_backend_id != -1) {
+ cur_backend_id = *node_backend_id;
} else {
- tensor_backend_id(node) = cur_backend_id;
+ *node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.3");
}
}
@@ -1351,9 +1372,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// pass 3: assign backends to remaining src from dst and view_src
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
- int cur_backend_id = tensor_backend_id(node);
- if (node->view_src != NULL && cur_backend_id == -1) {
- cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
+ int * cur_backend_id = &tensor_backend_id(node);
+ if (node->view_src != NULL && *cur_backend_id == -1) {
+ *cur_backend_id = tensor_backend_id(node->view_src);
SET_CAUSE(node, "3.vsrc");
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1361,14 +1382,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (src == NULL) {
continue;
}
- int src_backend_id = tensor_backend_id(src);
- if (src_backend_id == -1) {
+ int * src_backend_id = &tensor_backend_id(src);
+ if (*src_backend_id == -1) {
if (src->view_src != NULL) {
// views are always on the same backend as the source
- tensor_backend_id(src) = tensor_backend_id(src->view_src);
+ *src_backend_id = tensor_backend_id(src->view_src);
SET_CAUSE(src, "3.vsrc");
} else {
- tensor_backend_id(src) = cur_backend_id;
+ *src_backend_id = *cur_backend_id;
SET_CAUSE(src, "3.cur");
}
}
@@ -1380,19 +1401,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// pass 4: split graph, find tensors that need to be copied
{
- int cur_split = 0;
+ int i_split = 0;
+ struct ggml_backend_sched_split * split = &sched->splits[0];
// find the backend of the first split, skipping view ops
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (!ggml_is_view_op(node->op)) {
- sched->splits[0].backend_id = tensor_backend_id(node);
+ split->backend_id = tensor_backend_id(node);
break;
}
}
- sched->splits[0].i_start = 0;
- sched->splits[0].n_inputs = 0;
- memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
- int cur_backend_id = sched->splits[0].backend_id;
+ split->i_start = 0;
+ split->n_inputs = 0;
+ memset(split->inputs, 0, sizeof(split->inputs)); //HACK
+ int cur_backend_id = split->backend_id;
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
@@ -1400,18 +1422,54 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
continue;
}
- int tensor_backend_id = tensor_backend_id(node);
+ const int node_backend_id = tensor_backend_id(node);
- GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
+ GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
- if (tensor_backend_id != cur_backend_id) {
- sched->splits[cur_split].i_end = i;
- cur_split++;
- GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
- sched->splits[cur_split].backend_id = tensor_backend_id;
- sched->splits[cur_split].i_start = i;
- sched->splits[cur_split].n_inputs = 0;
- cur_backend_id = tensor_backend_id;
+ // check if we should start a new split based on the sources of the current node
+ bool need_new_split = false;
+ if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ continue;
+ }
+ // check if a weight is on a different backend
+ // by starting a new split, the memory of the previously offloaded weights can be reused
+ if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+ int src_backend_id = tensor_backend_id(src);
+ if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
+ need_new_split = true;
+ break;
+ }
+ }
+ // check if the split has too many inputs
+ if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
+ const size_t id = hash_id(src);
+ int src_backend_id = sched->tensor_backend_id[id];
+ if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+ //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
+ need_new_split = true;
+ break;
+ }
+ }
+ }
+ }
+
+ if (node_backend_id != cur_backend_id || need_new_split) {
+ split->i_end = i;
+ i_split++;
+ if (i_split >= sched->splits_capacity) {
+ sched->splits_capacity *= 2;
+ sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+ GGML_ASSERT(sched->splits != NULL);
+ }
+ GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
+ split = &sched->splits[i_split];
+ split->backend_id = node_backend_id;
+ split->i_start = i;
+ split->n_inputs = 0;
+ cur_backend_id = node_backend_id;
}
// find inputs that are not on the same backend
@@ -1421,10 +1479,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
continue;
}
- int src_backend_id = tensor_backend_id(src);
+ const int src_backend_id = tensor_backend_id(src);
assert(src_backend_id != -1); // all inputs should be assigned by now
- if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+ if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
size_t id = hash_id(src);
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
ggml_backend_t backend = sched->backends[src_backend_id];
@@ -1441,7 +1499,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
}
sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
- tensor_backend_id(tensor_copy) = src_backend_id;
SET_CAUSE(tensor_copy, "4.cpy");
}
int n_graph_inputs = sched->n_graph_inputs++;
@@ -1450,9 +1507,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
- if (src_backend_id != tensor_backend_id) {
+ if (src_backend_id != node_backend_id) {
// create a copy of the input in the split's backend
- size_t id = hash_id(src);
+ const size_t id = hash_id(src);
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
ggml_backend_t backend = sched->backends[cur_backend_id];
for (int c = 0; c < sched->n_copies; c++) {
@@ -1463,76 +1520,42 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
}
sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
- tensor_backend_id(tensor_copy) = cur_backend_id;
SET_CAUSE(tensor_copy, "4.cpy");
}
- int n_inputs = sched->splits[cur_split].n_inputs++;
+ int n_inputs = split->n_inputs++;
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
- sched->splits[cur_split].inputs[n_inputs] = src;
+ split->inputs[n_inputs] = src;
}
node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
}
}
}
- sched->splits[cur_split].i_end = graph->n_nodes;
- sched->n_splits = cur_split + 1;
+ split->i_end = graph->n_nodes;
+ sched->n_splits = i_split + 1;
}
#ifdef DEBUG_PASS4
fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif
-#ifndef NDEBUG
- // sanity check: all sources should have the same backend as the node
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
- if (tensor_backend == NULL) {
- fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
- }
- if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
- fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
- node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
- node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
- ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
- }
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- continue;
- }
- ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
- if (src_backend != tensor_backend /* && src_backend != NULL */) {
- fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
- node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
- j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
- }
- if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
- fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
- src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
- src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
- ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
- }
- }
- }
- fflush(stderr);
-#endif
-
// create copies of the graph for each split
// TODO: avoid this copy
- struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
+ struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
for (int i = 0; i < sched->n_splits; i++) {
struct ggml_backend_sched_split * split = &sched->splits[i];
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
for (int j = 0; j < split->n_inputs; j++) {
+ assert(graph_copy->size > (graph_copy->n_nodes + 1));
+
struct ggml_tensor * input = split->inputs[j];
- struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
+ const size_t input_id = hash_id(input);
+ struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
// add a dependency to the input source so that it is not freed before the copy is done
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
input_dep->src[0] = input;
- sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
+ sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
// add a dependency to the input copy so that it is allocated at the start of the split
@@ -1541,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
for (int j = split->i_start; j < split->i_end; j++) {
+ assert(graph_copy->size > graph_copy->n_nodes);
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
}
@@ -1625,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
ggml_backend_tensor_copy(input, input_cpy);
} else {
+ // wait for the split backend to finish using the input before overwriting it
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
- ggml_backend_synchronize(input_backend);
}
-
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
}
}
@@ -1701,17 +1724,21 @@ ggml_backend_sched_t ggml_backend_sched_new(
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
// initialize hash table
- sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+ sched->hash_set = ggml_hash_set_new(graph_size);
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
- sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
- sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
+
+ const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+ sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
+ sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
sched->n_backends = n_backends;
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
- GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
+ const int initial_splits_capacity = 16;
+ sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+ sched->splits_capacity = initial_splits_capacity;
for (int b = 0; b < n_backends; b++) {
sched->backends[b] = backends[b];
@@ -1742,6 +1769,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
}
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
+ free(sched->splits);
free(sched->hash_set.keys);
free(sched->tensor_backend_id);
free(sched->tensor_copies);
@@ -1762,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
}
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+ GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+
ggml_backend_sched_split_graph(sched, measure_graph);
// TODO: extract this to a separate function
@@ -1776,7 +1806,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
}
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
- GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+ GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
ggml_backend_sched_split_graph(sched, graph);
diff --git a/external/whispercpp/ggml-quants.c b/external/whispercpp/ggml-quants.c
index 109dd6660..f26798acc 100644
--- a/external/whispercpp/ggml-quants.c
+++ b/external/whispercpp/ggml-quants.c
@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
}
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if __AVXVNNI__
+#if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
const __m256i zero = _mm256_setzero_si256();
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
return _mm256_cvtepi32_ps(summed_pairs);
@@ -11705,9 +11705,8 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
float * scales, float * weight, uint8_t * L,
const int8_t * values,
- const float * quant_weights) {
-
- const int ntry = 7;
+ const float * quant_weights,
+ const int ntry) {
float sigma2 = 0;
for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
@@ -11719,6 +11718,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
float max_scale = 0, amax_scale = 0;
for (int ib = 0; ib < super_block_size/block_size; ++ib) {
const float * xb = x + ib*block_size;
+ uint8_t * Lb = L + ib*block_size;
if (quant_weights) {
const float * qw = quant_weights + ib*block_size;
for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
@@ -11736,12 +11736,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
scales[ib] = 0;
continue;
}
- float d = -max/values[0];
+ float d = ntry > 0 ? -max/values[0] : max/values[0];
float id = 1/d;
float sumqx = 0, sumq2 = 0;
for (int j = 0; j < block_size; ++j) {
float al = id*xb[j];
int l = best_index_int8(16, values, al);
+ Lb[j] = l;
float q = values[l];
float w = weight[j];
sumqx += w*q*xb[j];
@@ -11796,9 +11797,11 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
}
} else {
dh[0] = GGML_FP32_TO_FP16(scales[0]);
- float id = scales[0] ? 1/scales[0] : 0;
- for (int j = 0; j < super_block_size; ++j) {
- L[j] = best_index_int8(16, values, id*x[j]);
+ if (ntry > 0) {
+ float id = scales[0] ? 1/scales[0] : 0;
+ for (int j = 0; j < super_block_size; ++j) {
+ L[j] = best_index_int8(16, values, id*x[j]);
+ }
}
}
@@ -11823,7 +11826,7 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
for (int ibl = 0; ibl < nblock; ++ibl) {
const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
- &scale, weight, L, kvalues_iq4nl, qw);
+ &scale, weight, L, kvalues_iq4nl, qw, 7);
}
src += n_per_row;
qrow += nblock*sizeof(block_iq4_nl);
@@ -11832,14 +11835,23 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
}
void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
- assert(k % QK4_NL == 0);
- block_iq4_nl * restrict y = vy;
- quantize_row_iq4_nl_reference(x, y, k);
+ GGML_ASSERT(k%QK4_NL == 0);
+ int nblock = k/QK4_NL;
+ uint8_t L[QK4_NL];
+ float weight[QK4_NL];
+ uint16_t unused_h;
+ uint8_t * unused_l = NULL;
+ float scale;
+ block_iq4_nl * iq4 = (block_iq4_nl *)vy;
+ for (int ibl = 0; ibl < nblock; ++ibl) {
+ quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
+ &scale, weight, L, kvalues_iq4nl, NULL, -1);
+ }
}
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
assert(k % QK4_NL == 0);
- quantize_iq4_nl(x, y, 1, k, NULL);
+ quantize_row_iq4_nl(x, y, k);
}
size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
@@ -11857,7 +11869,7 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow
for (int ibl = 0; ibl < nblock; ++ibl) {
const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
- scales, weight, L, kvalues_iq4nl, qw);
+ scales, weight, L, kvalues_iq4nl, qw, 7);
}
src += n_per_row;
qrow += nblock*sizeof(block_iq4_xs);
diff --git a/external/whispercpp/ggml.c b/external/whispercpp/ggml.c
index fbc66f65b..62b833959 100644
--- a/external/whispercpp/ggml.c
+++ b/external/whispercpp/ggml.c
@@ -3,6 +3,7 @@
#include "ggml-impl.h"
#include "ggml-quants.h"
+#include "ggml.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
@@ -43,6 +44,10 @@
#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+ #define NOMINMAX
+#endif
#include <windows.h>
typedef volatile LONG atomic_int;
@@ -282,14 +287,10 @@ inline static void * ggml_calloc(size_t num, size_t size) {
#else
#include <cblas.h>
#endif
-#elif defined(GGML_USE_CUBLAS)
-#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#elif defined(GGML_USE_VULKAN)
#include "ggml-vulkan.h"
-#elif defined(GGML_USE_SYCL)
-#include "ggml-sycl.h"
#endif
// floating point type used to accumulate sums
@@ -432,6 +433,57 @@ int64_t ggml_cycles_per_ms(void) {
#define ggml_perf_cycles_per_ms() 0
#endif
+//
+// cross-platform UTF-8 file paths
+//
+
+#ifdef _WIN32
+static wchar_t * ggml_mbstowcs(const char * mbs) {
+ int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
+ if (!wlen) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
+ wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
+ if (!wlen) {
+ GGML_FREE(wbuf);
+ errno = EINVAL;
+ return NULL;
+ }
+
+ return wbuf;
+}
+#endif
+
+FILE * ggml_fopen(const char * fname, const char * mode) {
+#ifdef _WIN32
+ FILE * file = NULL;
+
+ // convert fname (UTF-8)
+ wchar_t * wfname = ggml_mbstowcs(fname);
+ if (wfname) {
+ // convert mode (ANSI)
+ wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
+ wchar_t * wmode_p = wmode;
+ do {
+ *wmode_p++ = (wchar_t)*mode;
+ } while (*mode++);
+
+ // open file
+ file = _wfopen(wfname, wmode);
+
+ GGML_FREE(wfname);
+ GGML_FREE(wmode);
+ }
+
+ return file;
+#else
+ return fopen(fname, mode);
+#endif
+}
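+
+// usage sketch (hypothetical path): FILE * f = ggml_fopen("models/模型.gguf", "rb");
+// on Windows the UTF-8 name is widened and opened with _wfopen(), elsewhere this is a plain fopen()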
+
//
// cache line
//
@@ -470,6 +522,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.type_size = sizeof(int32_t),
.is_quantized = false,
},
+ [GGML_TYPE_I64] = {
+ .type_name = "i64",
+ .blck_size = 1,
+ .type_size = sizeof(int64_t),
+ .is_quantized = false,
+ },
+ [GGML_TYPE_F64] = {
+ .type_name = "f64",
+ .blck_size = 1,
+ .type_size = sizeof(double),
+ .is_quantized = false,
+ .nrows = 1,
+ },
[GGML_TYPE_F32] = {
.type_name = "f32",
.blck_size = 1,
@@ -918,6 +983,101 @@ inline static float vaddvq_f32(float32x4_t v) {
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
#endif
+#elif defined(__AVX512F__)
+
+#define GGML_SIMD
+
+// F32 AVX512
+
+#define GGML_F32_STEP 64
+#define GGML_F32_EPR 16
+
+#define GGML_F32x16 __m512
+#define GGML_F32x16_ZERO _mm512_setzero_ps()
+#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+#define GGML_F32x16_LOAD _mm512_loadu_ps
+#define GGML_F32x16_STORE _mm512_storeu_ps
+// _mm512_fmadd_ps is defined in AVX512F so no guard is required
+#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32x16_ADD _mm512_add_ps
+#define GGML_F32x16_MUL _mm512_mul_ps
+#define GGML_F32x16_REDUCE(res, x) \
+do { \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ res = _mm512_reduce_add_ps(x[0]); \
+} while (0)
+
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC GGML_F32x16
+#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x16_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x16_STORE
+#define GGML_F32_VEC_FMA GGML_F32x16_FMA
+#define GGML_F32_VEC_ADD GGML_F32x16_ADD
+#define GGML_F32_VEC_MUL GGML_F32x16_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+// F16 AVX512
+
+// F16 AVX
+
+#define GGML_F16_STEP 64
+#define GGML_F16_EPR 16
+
+// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+#define GGML_F32Cx16 __m512
+#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
+#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
+
+// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+// so F16C guard isn't required
+#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32Cx16_ADD _mm512_add_ps
+#define GGML_F32Cx16_MUL _mm512_mul_ps
+#define GGML_F32Cx16_REDUCE(res, x) \
+do { \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ res = _mm512_reduce_add_ps(x[0]); \
+} while (0)
+
+#define GGML_F16_VEC GGML_F32Cx16
+#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
+#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
+#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
+#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
+
#elif defined(__AVX__)
#define GGML_SIMD
@@ -2532,14 +2692,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}
-#if defined(GGML_USE_CUBLAS)
- ggml_init_cublas();
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
ggml_cl_init();
#elif defined(GGML_USE_VULKAN)
ggml_vk_init_cpu_assist();
-#elif defined(GGML_USE_SYCL)
- ggml_init_sycl();
#endif
ggml_setup_op_has_task_pass();
@@ -10997,7 +11153,6 @@ static void ggml_compute_forward_out_prod_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
- // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
// TODO: #if defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -11197,7 +11352,6 @@ static void ggml_compute_forward_out_prod_q_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
- // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (params->type == GGML_TASK_TYPE_INIT) {
@@ -12418,6 +12572,8 @@ static void ggml_compute_forward_alibi(
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
+ case GGML_TYPE_I64:
+ case GGML_TYPE_F64:
case GGML_TYPE_COUNT:
{
GGML_ASSERT(false);
@@ -12504,6 +12660,8 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
+ case GGML_TYPE_I64:
+ case GGML_TYPE_F64:
case GGML_TYPE_COUNT:
{
GGML_ASSERT(false);
@@ -15939,14 +16097,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
return;
}
-#ifdef GGML_USE_CUBLAS
- bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
- if (skip_cpu) {
- return;
- }
- GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
- GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#elif defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_VULKAN)
const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS
if (skip_cpu) {
@@ -15958,14 +16109,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
}
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_VULKAN
-#ifdef GGML_USE_SYCL
- bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
- if (skip_cpu) {
- return;
- }
-#endif // GGML_USE_SYCL
switch (tensor->op) {
case GGML_OP_DUP:
{
@@ -18640,7 +18785,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
// write binary data
{
- FILE * fout = fopen(fname, "wb");
+ FILE * fout = ggml_fopen(fname, "wb");
if (!fout) {
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
@@ -18778,7 +18923,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
// read file into data
{
- FILE * fin = fopen(fname, "rb");
+ FILE * fin = ggml_fopen(fname, "rb");
if (!fin) {
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
return result;
@@ -19114,7 +19259,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node,
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
char color[16];
- FILE * fp = fopen(filename, "w");
+ FILE * fp = ggml_fopen(filename, "w");
GGML_ASSERT(fp);
fprintf(fp, "digraph G {\n");
@@ -20432,7 +20577,7 @@ struct gguf_context * gguf_init_empty(void) {
}
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
- FILE * file = fopen(fname, "rb");
+ FILE * file = ggml_fopen(fname, "rb");
if (!file) {
return NULL;
}
@@ -21387,7 +21532,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
}
void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
- FILE * file = fopen(fname, "wb");
+ FILE * file = ggml_fopen(fname, "wb");
if (!file) {
GGML_ASSERT(false && "failed to open file for writing");
}
@@ -21529,15 +21674,15 @@ int ggml_cpu_has_wasm_simd(void) {
}
int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
return 1;
#else
return 0;
#endif
}
-int ggml_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+int ggml_cpu_has_cuda(void) {
+#if defined(GGML_USE_CUDA)
return 1;
#else
return 0;
@@ -21577,7 +21722,7 @@ int ggml_cpu_has_sycl(void) {
}
int ggml_cpu_has_gpublas(void) {
- return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+ return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
ggml_cpu_has_sycl();
}
diff --git a/external/whispercpp/jni/common.cpp b/external/whispercpp/jni/common.cpp
new file mode 100644
index 000000000..9061989bb
--- /dev/null
+++ b/external/whispercpp/jni/common.cpp
@@ -0,0 +1,2604 @@
+#include "common.h"
+#include "llama.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#if defined(__APPLE__) && defined(__MACH__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+# define NOMINMAX
+#endif
+#include
+#include
+#include
+#include
+#include
+#else
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#endif
+#if defined(LLAMA_USE_CURL)
+#include <curl/curl.h>
+#include <curl/easy.h>
+#include <thread>
+#include <future>
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUDA_SYCL
+#endif
+
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
+#define GGML_USE_CUDA_SYCL_VULKAN
+#endif
+
+#if defined(LLAMA_USE_CURL)
+#ifdef __linux__
+#include <linux/limits.h>
+#elif defined(_WIN32)
+#define PATH_MAX MAX_PATH
+#else
+#include <sys/syslimits.h>
+#endif
+#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+#define LLAMA_CURL_MAX_HEADER_LENGTH 256
+#endif // LLAMA_USE_CURL
+
+int32_t get_num_physical_cores() {
+#ifdef __linux__
+ // enumerate the set of thread siblings, num entries is num cores
+    std::unordered_set<std::string> siblings;
+ for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
+ std::ifstream thread_siblings("/sys/devices/system/cpu"
+ + std::to_string(cpu) + "/topology/thread_siblings");
+ if (!thread_siblings.is_open()) {
+ break; // no more cpus
+ }
+ std::string line;
+ if (std::getline(thread_siblings, line)) {
+ siblings.insert(line);
+ }
+ }
+ if (!siblings.empty()) {
+        return static_cast<int32_t>(siblings.size());
+ }
+#elif defined(__APPLE__) && defined(__MACH__)
+ int32_t num_physical_cores;
+ size_t len = sizeof(num_physical_cores);
+ int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
+ if (result == 0) {
+ return num_physical_cores;
+ }
+ result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
+ if (result == 0) {
+ return num_physical_cores;
+ }
+#elif defined(_WIN32)
+ //TODO: Implement
+#endif
+ unsigned int n_threads = std::thread::hardware_concurrency();
+ return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
+}
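+// note: if the physical core count cannot be determined, the fallback above returns hardware_concurrency()
+// for small machines (<= 4 hardware threads), half of it otherwise, and 4 when even that is unknown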
+
+void process_escapes(std::string & input) {
+ std::size_t input_len = input.length();
+ std::size_t output_idx = 0;
+
+ for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
+ if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
+ switch (input[++input_idx]) {
+ case 'n': input[output_idx++] = '\n'; break;
+ case 'r': input[output_idx++] = '\r'; break;
+ case 't': input[output_idx++] = '\t'; break;
+ case '\'': input[output_idx++] = '\''; break;
+ case '\"': input[output_idx++] = '\"'; break;
+ case '\\': input[output_idx++] = '\\'; break;
+ case 'x':
+ // Handle \x12, etc
+ if (input_idx + 2 < input_len) {
+ const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
+ char *err_p = nullptr;
+ const long val = std::strtol(x, &err_p, 16);
+ if (err_p == x + 2) {
+ input_idx += 2;
+ input[output_idx++] = char(val);
+ break;
+ }
+ }
+ // fall through
+ default: input[output_idx++] = '\\';
+ input[output_idx++] = input[input_idx]; break;
+ }
+ } else {
+ input[output_idx++] = input[input_idx];
+ }
+ }
+
+ input.resize(output_idx);
+}
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+ bool result = true;
+ try {
+ if (!gpt_params_parse_ex(argc, argv, params)) {
+ gpt_print_usage(argc, argv, gpt_params());
+ exit(0);
+ }
+ }
+ catch (const std::invalid_argument & ex) {
+ fprintf(stderr, "%s\n", ex.what());
+ gpt_print_usage(argc, argv, gpt_params());
+ exit(1);
+ }
+ return result;
+}
+
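+// Try to match a single command-line option and consume its value from argv.
+// Returns false if the option is unknown; sets invalid_param when a required value is missing or malformed.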
+bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
+ llama_sampling_params& sparams = params.sparams;
+
+ if (arg == "-s" || arg == "--seed") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.seed = std::stoul(argv[i]);
+ return true;
+ }
+ if (arg == "-t" || arg == "--threads") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_threads = std::stoi(argv[i]);
+ if (params.n_threads <= 0) {
+ params.n_threads = std::thread::hardware_concurrency();
+ }
+ return true;
+ }
+ if (arg == "-tb" || arg == "--threads-batch") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_threads_batch = std::stoi(argv[i]);
+ if (params.n_threads_batch <= 0) {
+ params.n_threads_batch = std::thread::hardware_concurrency();
+ }
+ return true;
+ }
+ if (arg == "-td" || arg == "--threads-draft") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_threads_draft = std::stoi(argv[i]);
+ if (params.n_threads_draft <= 0) {
+ params.n_threads_draft = std::thread::hardware_concurrency();
+ }
+ return true;
+ }
+ if (arg == "-tbd" || arg == "--threads-batch-draft") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_threads_batch_draft = std::stoi(argv[i]);
+ if (params.n_threads_batch_draft <= 0) {
+ params.n_threads_batch_draft = std::thread::hardware_concurrency();
+ }
+ return true;
+ }
+ if (arg == "-p" || arg == "--prompt") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.prompt = argv[i];
+ return true;
+ }
+ if (arg == "-e" || arg == "--escape") {
+ params.escape = true;
+ return true;
+ }
+ if (arg == "--prompt-cache") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.path_prompt_cache = argv[i];
+ return true;
+ }
+ if (arg == "--prompt-cache-all") {
+ params.prompt_cache_all = true;
+ return true;
+ }
+ if (arg == "--prompt-cache-ro") {
+ params.prompt_cache_ro = true;
+ return true;
+ }
+ if (arg == "-bf" || arg == "--binary-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i], std::ios::binary);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ // store the external file name in params
+ params.prompt_file = argv[i];
+ std::ostringstream ss;
+ ss << file.rdbuf();
+ params.prompt = ss.str();
+ fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
+ return true;
+ }
+ if (arg == "-f" || arg == "--file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i]);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ // store the external file name in params
+ params.prompt_file = argv[i];
+ std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+ if (!params.prompt.empty() && params.prompt.back() == '\n') {
+ params.prompt.pop_back();
+ }
+ return true;
+ }
+ if (arg == "-n" || arg == "--n-predict") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_predict = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--top-k") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.top_k = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "-c" || arg == "--ctx-size") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_ctx = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--grp-attn-n" || arg == "-gan") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.grp_attn_n = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--grp-attn-w" || arg == "-gaw") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.grp_attn_w = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--rope-freq-base") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.rope_freq_base = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--rope-freq-scale") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.rope_freq_scale = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--rope-scaling") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::string value(argv[i]);
+ /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
+ else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
+ else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
+ else { invalid_param = true; }
+ return true;
+ }
+ if (arg == "--rope-scale") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.rope_freq_scale = 1.0f / std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--yarn-orig-ctx") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.yarn_orig_ctx = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--yarn-ext-factor") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.yarn_ext_factor = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--yarn-attn-factor") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.yarn_attn_factor = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--yarn-beta-fast") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.yarn_beta_fast = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--yarn-beta-slow") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.yarn_beta_slow = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--pooling") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::string value(argv[i]);
+ /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
+ else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
+ else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+ else { invalid_param = true; }
+ return true;
+ }
+ if (arg == "--defrag-thold" || arg == "-dt") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.defrag_thold = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--samplers") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ const auto sampler_names = string_split(argv[i], ';');
+ sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
+ return true;
+ }
+ if (arg == "--sampling-seq") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
+ return true;
+ }
+ if (arg == "--top-p") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.top_p = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--min-p") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.min_p = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--temp") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.temp = std::stof(argv[i]);
+ sparams.temp = std::max(sparams.temp, 0.0f);
+ return true;
+ }
+ if (arg == "--tfs") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.tfs_z = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--typical") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.typical_p = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--repeat-last-n") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.penalty_last_n = std::stoi(argv[i]);
+ sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
+ return true;
+ }
+ if (arg == "--repeat-penalty") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.penalty_repeat = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--frequency-penalty") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.penalty_freq = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--presence-penalty") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.penalty_present = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--dynatemp-range") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.dynatemp_range = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--dynatemp-exp") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.dynatemp_exponent = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--mirostat") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.mirostat = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--mirostat-lr") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.mirostat_eta = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--mirostat-ent") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.mirostat_tau = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "--cfg-negative-prompt") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.cfg_negative_prompt = argv[i];
+ return true;
+ }
+ if (arg == "--cfg-negative-prompt-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i]);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
+ if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
+ sparams.cfg_negative_prompt.pop_back();
+ }
+ return true;
+ }
+ if (arg == "--cfg-scale") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.cfg_scale = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "-b" || arg == "--batch-size") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_batch = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "-ub" || arg == "--ubatch-size") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_ubatch = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--keep") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_keep = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--draft") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_draft = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--chunks") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_chunks = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "-np" || arg == "--parallel") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_parallel = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "-ns" || arg == "--sequences") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_sequences = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--p-split" || arg == "-ps") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.p_split = std::stof(argv[i]);
+ return true;
+ }
+ if (arg == "-m" || arg == "--model") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.model = argv[i];
+ return true;
+ }
+ if (arg == "-md" || arg == "--model-draft") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.model_draft = argv[i];
+ return true;
+ }
+ if (arg == "-a" || arg == "--alias") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.model_alias = argv[i];
+ return true;
+ }
+ if (arg == "-mu" || arg == "--model-url") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.model_url = argv[i];
+ return true;
+ }
+ if (arg == "-hfr" || arg == "--hf-repo") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.hf_repo = argv[i];
+ return true;
+ }
+ if (arg == "-hff" || arg == "--hf-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.hf_file = argv[i];
+ return true;
+ }
+ if (arg == "--lora") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.lora_adapter.emplace_back(argv[i], 1.0f);
+ params.use_mmap = false;
+ return true;
+ }
+ if (arg == "--lora-scaled") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ const char* lora_adapter = argv[i];
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+ params.use_mmap = false;
+ return true;
+ }
+ if (arg == "--lora-base") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.lora_base = argv[i];
+ return true;
+ }
+ if (arg == "--control-vector") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.control_vectors.push_back({ 1.0f, argv[i], });
+ return true;
+ }
+ if (arg == "--control-vector-scaled") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ const char* fname = argv[i];
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.control_vectors.push_back({ std::stof(argv[i]), fname, });
+ return true;
+ }
+ if (arg == "--control-vector-layer-range") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.control_vector_layer_start = std::stoi(argv[i]);
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.control_vector_layer_end = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--mmproj") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.mmproj = argv[i];
+ return true;
+ }
+ if (arg == "--image") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.image = argv[i];
+ return true;
+ }
+ if (arg == "-i" || arg == "--interactive") {
+ params.interactive = true;
+ return true;
+ }
+ if (arg == "--embedding") {
+ params.embedding = true;
+ return true;
+ }
+ if (arg == "--interactive-first") {
+ params.interactive_first = true;
+ return true;
+ }
+ if (arg == "-ins" || arg == "--instruct") {
+ params.instruct = true;
+ return true;
+ }
+ if (arg == "-cml" || arg == "--chatml") {
+ params.chatml = true;
+ return true;
+ }
+ if (arg == "--infill") {
+ params.infill = true;
+ return true;
+ }
+ if (arg == "-dkvc" || arg == "--dump-kv-cache") {
+ params.dump_kv_cache = true;
+ return true;
+ }
+ if (arg == "-nkvo" || arg == "--no-kv-offload") {
+ params.no_kv_offload = true;
+ return true;
+ }
+ if (arg == "-ctk" || arg == "--cache-type-k") {
+ params.cache_type_k = argv[++i];
+ return true;
+ }
+ if (arg == "-ctv" || arg == "--cache-type-v") {
+ params.cache_type_v = argv[++i];
+ return true;
+ }
+ if (arg == "--multiline-input") {
+ params.multiline_input = true;
+ return true;
+ }
+ if (arg == "--simple-io") {
+ params.simple_io = true;
+ return true;
+ }
+ if (arg == "-cb" || arg == "--cont-batching") {
+ params.cont_batching = true;
+ return true;
+ }
+ if (arg == "--color") {
+ params.use_color = true;
+ return true;
+ }
+ if (arg == "--mlock") {
+ params.use_mlock = true;
+ return true;
+ }
+ if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_gpu_layers = std::stoi(argv[i]);
+ if (!llama_supports_gpu_offload()) {
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+ }
+ return true;
+ }
+ if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_gpu_layers_draft = std::stoi(argv[i]);
+ if (!llama_supports_gpu_offload()) {
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+ }
+ return true;
+ }
+ if (arg == "--main-gpu" || arg == "-mg") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.main_gpu = std::stoi(argv[i]);
+#ifndef GGML_USE_CUDA_SYCL
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL
+ return true;
+ }
+ if (arg == "--split-mode" || arg == "-sm") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::string arg_next = argv[i];
+ if (arg_next == "none") {
+ params.split_mode = LLAMA_SPLIT_MODE_NONE;
+ }
+ else if (arg_next == "layer") {
+ params.split_mode = LLAMA_SPLIT_MODE_LAYER;
+ }
+ else if (arg_next == "row") {
+#ifdef GGML_USE_SYCL
+ fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
+ exit(1);
+#endif // GGML_USE_SYCL
+ params.split_mode = LLAMA_SPLIT_MODE_ROW;
+ }
+ else {
+ invalid_param = true;
+ return true;
+ }
+#ifndef GGML_USE_CUDA_SYCL
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL
+ return true;
+ }
+ if (arg == "--tensor-split" || arg == "-ts") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::string arg_next = argv[i];
+
+ // split string by , and /
+ const std::regex regex{ R"([,/]+)" };
+ std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+ std::vector<std::string> split_arg{ it, {} };
+ if (split_arg.size() >= llama_max_devices()) {
+ invalid_param = true;
+ return true;
+ }
+ for (size_t i = 0; i < llama_max_devices(); ++i) {
+ if (i < split_arg.size()) {
+ params.tensor_split[i] = std::stof(split_arg[i]);
+ }
+ else {
+ params.tensor_split[i] = 0.0f;
+ }
+ }
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
+ return true;
+ }
+ if (arg == "--no-mmap") {
+ params.use_mmap = false;
+ return true;
+ }
+ if (arg == "--numa") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::string value(argv[i]);
+ /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+ else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+ else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+ else { invalid_param = true; }
+ return true;
+ }
+ if (arg == "--verbose-prompt") {
+ params.verbose_prompt = true;
+ return true;
+ }
+ if (arg == "--no-display-prompt") {
+ params.display_prompt = false;
+ return true;
+ }
+ if (arg == "-r" || arg == "--reverse-prompt") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.antiprompt.emplace_back(argv[i]);
+ return true;
+ }
+ if (arg == "-ld" || arg == "--logdir") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.logdir = argv[i];
+
+ if (params.logdir.back() != DIRECTORY_SEPARATOR) {
+ params.logdir += DIRECTORY_SEPARATOR;
+ }
+ return true;
+ }
+ if (arg == "-lcs" || arg == "--lookup-cache-static") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.lookup_cache_static = argv[i];
+ return true;
+ }
+ if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.lookup_cache_dynamic = argv[i];
+ return true;
+ }
+ if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.logits_file = argv[i];
+ return true;
+ }
+ if (arg == "--perplexity" || arg == "--all-logits") {
+ params.logits_all = true;
+ return true;
+ }
+ if (arg == "--ppl-stride") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.ppl_stride = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "-ptc" || arg == "--print-token-count") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_print = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--ppl-output-type") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.ppl_output_type = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--hellaswag") {
+ params.hellaswag = true;
+ return true;
+ }
+ if (arg == "--hellaswag-tasks") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.hellaswag_tasks = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--winogrande") {
+ params.winogrande = true;
+ return true;
+ }
+ if (arg == "--winogrande-tasks") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.winogrande_tasks = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--multiple-choice") {
+ params.multiple_choice = true;
+ return true;
+ }
+ if (arg == "--multiple-choice-tasks") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.multiple_choice_tasks = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--kl-divergence") {
+ params.kl_divergence = true;
+ return true;
+ }
+ if (arg == "--ignore-eos") {
+ params.ignore_eos = true;
+ return true;
+ }
+ if (arg == "--no-penalize-nl") {
+ sparams.penalize_nl = false;
+ return true;
+ }
+ if (arg == "-l" || arg == "--logit-bias") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::stringstream ss(argv[i]);
+ llama_token key;
+ char sign;
+ std::string value_str;
+ try {
+ if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
+ sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+ }
+ else {
+ throw std::exception();
+ }
+ }
+ catch (const std::exception&) {
+ invalid_param = true;
+ return true;
+ }
+ return true;
+ }
+ if (arg == "-h" || arg == "--help") {
+ gpt_print_usage(argc, argv, gpt_params());
+ exit(0);
+ }
+ if (arg == "--random-prompt") {
+ params.random_prompt = true;
+ return true;
+ }
+ if (arg == "--in-prefix-bos") {
+ params.input_prefix_bos = true;
+ return true;
+ }
+ if (arg == "--in-prefix") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.input_prefix = argv[i];
+ return true;
+ }
+ if (arg == "--in-suffix") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.input_suffix = argv[i];
+ return true;
+ }
+ if (arg == "--grammar") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ sparams.grammar = argv[i];
+ return true;
+ }
+ if (arg == "--grammar-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i]);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ std::copy(
+ std::istreambuf_iterator<char>(file),
+ std::istreambuf_iterator<char>(),
+ std::back_inserter(sparams.grammar)
+ );
+ return true;
+ }
+ if (arg == "--override-kv") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ char* sep = strchr(argv[i], '=');
+ if (sep == nullptr || sep - argv[i] >= 128) {
+ fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ struct llama_model_kv_override kvo;
+ std::strncpy(kvo.key, argv[i], sep - argv[i]);
+ kvo.key[sep - argv[i]] = 0;
+ sep++;
+ if (strncmp(sep, "int:", 4) == 0) {
+ sep += 4;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+ kvo.int_value = std::atol(sep);
+ }
+ else if (strncmp(sep, "float:", 6) == 0) {
+ sep += 6;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+ kvo.float_value = std::atof(sep);
+ }
+ else if (strncmp(sep, "bool:", 5) == 0) {
+ sep += 5;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+ if (std::strcmp(sep, "true") == 0) {
+ kvo.bool_value = true;
+ }
+ else if (std::strcmp(sep, "false") == 0) {
+ kvo.bool_value = false;
+ }
+ else {
+ fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ }
+ else {
+ fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ params.kv_overrides.push_back(kvo);
+ return true;
+ }
+ return false;
+}
+
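+// Core parsing loop: normalizes "--foo_bar" to "--foo-bar", dispatches each option to
+// gpt_params_find_arg, throws std::invalid_argument on unknown or invalid options, then applies
+// post-processing (escape handling, --hf-file defaulting, KV-override list termination).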
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
+ bool invalid_param = false;
+ std::string arg;
+ const std::string arg_prefix = "--";
+ llama_sampling_params & sparams = params.sparams;
+
+ for (int i = 1; i < argc; i++) {
+ arg = argv[i];
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+ std::replace(arg.begin(), arg.end(), '_', '-');
+ }
+
+ if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
+ throw std::invalid_argument("error: unknown argument: " + arg);
+ }
+ }
+
+ if (invalid_param) {
+ throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+ }
+
+ if (params.prompt_cache_all &&
+ (params.interactive || params.interactive_first ||
+ params.instruct)) {
+
+ throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
+ }
+
+ // short-hand to avoid specifying --hf-file -> default it to --model
+ if (!params.hf_repo.empty() && params.hf_file.empty()) {
+ params.hf_file = params.model;
+ }
+
+ if (params.escape) {
+ process_escapes(params.prompt);
+ process_escapes(params.input_prefix);
+ process_escapes(params.input_suffix);
+ process_escapes(sparams.cfg_negative_prompt);
+ for (auto & antiprompt : params.antiprompt) {
+ process_escapes(antiprompt);
+ }
+ }
+
+ if (!params.kv_overrides.empty()) {
+ params.kv_overrides.emplace_back();
+ params.kv_overrides.back().key[0] = 0;
+ }
+
+ return true;
+}
+
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+ const llama_sampling_params & sparams = params.sparams;
+
+ std::string sampler_type_chars;
+ std::string sampler_type_names;
+ for (const auto sampler_type : sparams.samplers_sequence) {
+ sampler_type_chars += static_cast<char>(sampler_type);
+ sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
+ }
+ sampler_type_names.pop_back();
+
+ printf("\n");
+ printf("usage: %s [options]\n", argv[0]);
+ printf("\n");
+ printf("options:\n");
+ printf(" -h, --help show this help message and exit\n");
+ printf(" --version show version and build info\n");
+ printf(" -i, --interactive run in interactive mode\n");
+ printf(" --interactive-first run in interactive mode and wait for input right away\n");
+ printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
+ printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n");
+ printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
+ printf(" -r PROMPT, --reverse-prompt PROMPT\n");
+ printf(" halt generation at PROMPT, return control in interactive mode\n");
+ printf(" (can be specified more than once for multiple prompts).\n");
+ printf(" --color colorise output to distinguish prompt and user input from generations\n");
+ printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
+ printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
+ printf(" -tb N, --threads-batch N\n");
+ printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
+ printf(" -td N, --threads-draft N");
+ printf(" number of threads to use during generation (default: same as --threads)\n");
+ printf(" -tbd N, --threads-batch-draft N\n");
+ printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
+ printf(" -p PROMPT, --prompt PROMPT\n");
+ printf(" prompt to start generation with (default: empty)\n");
+ printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
+ printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
+ printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
+ printf(" not supported with --interactive or other interactive options\n");
+ printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
+ printf(" --random-prompt start with a randomized prompt.\n");
+ printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
+ printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n");
+ printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
+ printf(" -f FNAME, --file FNAME\n");
+ printf(" prompt file to start generation.\n");
+ printf(" -bf FNAME, --binary-file FNAME\n");
+ printf(" binary file containing multiple choice tasks.\n");
+ printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
+ printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
+ printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
+ printf(" -ub N, --ubatch-size N\n");
+ printf(" physical maximum batch size (default: %d)\n", params.n_ubatch);
+ printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
+ printf(" (default: %s)\n", sampler_type_names.c_str());
+ printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
+ printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
+ printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+ printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
+ printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
+ printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
+ printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
+ printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
+ printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
+ printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
+ printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range);
+ printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent);
+ printf(" --mirostat N use Mirostat sampling.\n");
+ printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
+ printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
+ printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
+ printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
+ printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
+ printf(" modifies the likelihood of token appearing in the completion,\n");
+ printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
+ printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
+ printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
+ printf(" --grammar-file FNAME file to read grammar from\n");
+ printf(" --cfg-negative-prompt PROMPT\n");
+ printf(" negative prompt to use for guidance. (default: empty)\n");
+ printf(" --cfg-negative-prompt-file FNAME\n");
+ printf(" negative prompt file to use for guidance. (default: empty)\n");
+ printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
+ printf(" --rope-scaling {none,linear,yarn}\n");
+ printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
+ printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n");
+ printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
+ printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
+ printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n");
+ printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
+ printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
+ printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
+ printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
+ printf(" --pooling {none,mean,cls}\n");
+ printf(" pooling type for embeddings, use model default if unspecified\n");
+ printf(" -dt N, --defrag-thold N\n");
+ printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
+ printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
+ printf(" --no-penalize-nl do not penalize newline token\n");
+ printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
+ printf(" --all-logits return logits for all tokens in the batch (default: disabled)\n");
+ printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
+ printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
+ printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
+ printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
+ printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
+ printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
+ printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n");
+ printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+ printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
+ printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
+ printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
+ printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
+ printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
+ printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+ printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
+ printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
+ if (llama_supports_mlock()) {
+ printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
+ }
+ if (llama_supports_mmap()) {
+ printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+ }
+ printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
+ printf(" - distribute: spread execution evenly over all nodes\n");
+ printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
+ printf(" - numactl: use the CPU map provided by numactl\n");
+ printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
+ printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
+ if (llama_supports_gpu_offload()) {
+ printf(" -ngl N, --n-gpu-layers N\n");
+ printf(" number of layers to store in VRAM\n");
+ printf(" -ngld N, --n-gpu-layers-draft N\n");
+ printf(" number of layers to store in VRAM for the draft model\n");
+ printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
+ printf(" how to split the model across multiple GPUs, one of:\n");
+ printf(" - none: use one GPU only\n");
+ printf(" - layer (default): split layers and KV across GPUs\n");
+ printf(" - row: split rows across GPUs\n");
+ printf(" -ts SPLIT, --tensor-split SPLIT\n");
+ printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
+ printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
+ printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
+ }
+ printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
+ printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
+ printf(" -gan N, --grp-attn-n N\n");
+ printf(" group-attention factor (default: %d)\n", params.grp_attn_n);
+ printf(" -gaw N, --grp-attn-w N\n");
+ printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
+ printf(" -dkvc, --dump-kv-cache\n");
+ printf(" verbose print of the KV cache\n");
+ printf(" -nkvo, --no-kv-offload\n");
+ printf(" disable KV offload\n");
+ printf(" -ctk TYPE, --cache-type-k TYPE\n");
+ printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
+ printf(" -ctv TYPE, --cache-type-v TYPE\n");
+ printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
+ printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
+ printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+ printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
+ printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
+ printf(" --control-vector FNAME\n");
+ printf(" add a control vector\n");
+ printf(" --control-vector-scaled FNAME S\n");
+ printf(" add a control vector with user defined scaling S\n");
+ printf(" --control-vector-layer-range START END\n");
+ printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
+ printf(" -m FNAME, --model FNAME\n");
+ printf(" model path (default: %s)\n", params.model.c_str());
+ printf(" -md FNAME, --model-draft FNAME\n");
+ printf(" draft model for speculative decoding (default: unused)\n");
+ printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
+ printf(" model download url (default: unused)\n");
+ printf(" -hfr REPO, --hf-repo REPO\n");
+ printf(" Hugging Face model repository (default: unused)\n");
+ printf(" -hff FILE, --hf-file FILE\n");
+ printf(" Hugging Face model file (default: unused)\n");
+ printf(" -ld LOGDIR, --logdir LOGDIR\n");
+ printf(" path under which to save YAML logs (no logging if unset)\n");
+ printf(" -lcs FNAME, --lookup-cache-static FNAME\n");
+ printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n");
+ printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n");
+ printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
+ printf(" --override-kv KEY=TYPE:VALUE\n");
+ printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
+ printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+ printf(" -ptc N, --print-token-count N\n");
+ printf(" print token count every N tokens (default: %d)\n", params.n_print);
+ printf("\n");
+
+}
+
+std::string get_system_info(const gpt_params & params) {
+ std::ostringstream os;
+
+ os << "system_info: n_threads = " << params.n_threads;
+ if (params.n_threads_batch != -1) {
+ os << " (n_threads_batch = " << params.n_threads_batch << ")";
+ }
+ os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+
+ return os.str();
+}
+
+std::string gpt_random_prompt(std::mt19937 & rng) {
+ const int r = rng() % 10;
+ switch (r) {
+ case 0: return "So";
+ case 1: return "Once upon a time";
+ case 2: return "When";
+ case 3: return "The";
+ case 4: return "After";
+ case 5: return "If";
+ case 6: return "import";
+ case 7: return "He";
+ case 8: return "She";
+ case 9: return "They";
+ }
+
+ GGML_UNREACHABLE();
+}
+
+//
+// String utils
+//
+
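+// Split a string on a single separator character,
+// e.g. string_split("top_k;top_p;temp", ';') -> {"top_k", "top_p", "temp"}.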
+std::vector<std::string> string_split(std::string input, char separator) {
+ std::vector<std::string> parts;
+ size_t separator_pos = input.find(separator);
+ while (separator_pos != std::string::npos) {
+ std::string part = input.substr(0, separator_pos);
+ parts.emplace_back(part);
+ input = input.substr(separator_pos + 1);
+ separator_pos = input.find(separator);
+ }
+ parts.emplace_back(input);
+ return parts;
+}
+
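+// Map sampler names (canonical spellings like "top_k", or alternates like "nucleus" when
+// allow_alt_names is set) to llama_sampler_type values; unknown names are silently dropped.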
+std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+ std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
+ {"top_k", llama_sampler_type::TOP_K},
+ {"top_p", llama_sampler_type::TOP_P},
+ {"typical_p", llama_sampler_type::TYPICAL_P},
+ {"min_p", llama_sampler_type::MIN_P},
+ {"tfs_z", llama_sampler_type::TFS_Z},
+ {"temperature", llama_sampler_type::TEMPERATURE}
+ };
+
+ // since samplers names are written multiple ways
+ // make it ready for both system names and input names
+ std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
+ {"top-k", llama_sampler_type::TOP_K},
+ {"top-p", llama_sampler_type::TOP_P},
+ {"nucleus", llama_sampler_type::TOP_P},
+ {"typical-p", llama_sampler_type::TYPICAL_P},
+ {"typical", llama_sampler_type::TYPICAL_P},
+ {"min-p", llama_sampler_type::MIN_P},
+ {"tfs-z", llama_sampler_type::TFS_Z},
+ {"tfs", llama_sampler_type::TFS_Z},
+ {"temp", llama_sampler_type::TEMPERATURE}
+ };
+
+ std::vector<llama_sampler_type> sampler_types;
+ sampler_types.reserve(names.size());
+ for (const auto & name : names)
+ {
+ auto sampler_item = sampler_canonical_name_map.find(name);
+ if (sampler_item != sampler_canonical_name_map.end())
+ {
+ sampler_types.push_back(sampler_item->second);
+ }
+ else
+ {
+ if (allow_alt_names)
+ {
+ sampler_item = sampler_alt_name_map.find(name);
+ if (sampler_item != sampler_alt_name_map.end())
+ {
+ sampler_types.push_back(sampler_item->second);
+ }
+ }
+ }
+ }
+ return sampler_types;
+}
+
+std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
+ std::unordered_map<char, llama_sampler_type> sampler_name_map {
+ {'k', llama_sampler_type::TOP_K},
+ {'p', llama_sampler_type::TOP_P},
+ {'y', llama_sampler_type::TYPICAL_P},
+ {'m', llama_sampler_type::MIN_P},
+ {'f', llama_sampler_type::TFS_Z},
+ {'t', llama_sampler_type::TEMPERATURE}
+ };
+
+ std::vector<llama_sampler_type> sampler_types;
+ sampler_types.reserve(names_string.size());
+ for (const auto & c : names_string) {
+ const auto sampler_item = sampler_name_map.find(c);
+ if (sampler_item != sampler_name_map.end()) {
+ sampler_types.push_back(sampler_item->second);
+ }
+ }
+ return sampler_types;
+}
+
+std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
+ switch (sampler_type) {
+ case llama_sampler_type::TOP_K: return "top_k";
+ case llama_sampler_type::TFS_Z: return "tfs_z";
+ case llama_sampler_type::TYPICAL_P: return "typical_p";
+ case llama_sampler_type::TOP_P: return "top_p";
+ case llama_sampler_type::MIN_P: return "min_p";
+ case llama_sampler_type::TEMPERATURE: return "temperature";
+ default : return "";
+ }
+}
+
+//
+// Model utils
+//
+
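+// Translate the model-loading subset of gpt_params (GPU layers, split mode, mmap/mlock, KV overrides)
+// into a llama_model_params struct.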
+struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
+ auto mparams = llama_model_default_params();
+
+ if (params.n_gpu_layers != -1) {
+ mparams.n_gpu_layers = params.n_gpu_layers;
+ }
+ mparams.main_gpu = params.main_gpu;
+ mparams.split_mode = params.split_mode;
+ mparams.tensor_split = params.tensor_split;
+ mparams.use_mmap = params.use_mmap;
+ mparams.use_mlock = params.use_mlock;
+ if (params.kv_overrides.empty()) {
+ mparams.kv_overrides = NULL;
+ } else {
+ GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
+ mparams.kv_overrides = params.kv_overrides.data();
+ }
+
+ return mparams;
+}
+
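+// Map a -ctk / -ctv cache type string to the corresponding ggml_type; throws on unknown types.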
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+ if (s == "f32") {
+ return GGML_TYPE_F32;
+ }
+ if (s == "f16") {
+ return GGML_TYPE_F16;
+ }
+ if (s == "q8_0") {
+ return GGML_TYPE_Q8_0;
+ }
+ if (s == "q4_0") {
+ return GGML_TYPE_Q4_0;
+ }
+ if (s == "q4_1") {
+ return GGML_TYPE_Q4_1;
+ }
+ if (s == "iq4_nl") {
+ return GGML_TYPE_IQ4_NL;
+ }
+ if (s == "q5_0") {
+ return GGML_TYPE_Q5_0;
+ }
+ if (s == "q5_1") {
+ return GGML_TYPE_Q5_1;
+ }
+
+ throw std::runtime_error("Invalid cache type: " + s);
+}
+
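+// Translate the context-related subset of gpt_params (context size, batch sizes, threads,
+// RoPE/YaRN settings, KV cache types) into a llama_context_params struct.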
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+ auto cparams = llama_context_default_params();
+
+ cparams.n_ctx = params.n_ctx;
+ cparams.n_seq_max = params.n_parallel;
+ cparams.n_batch = params.n_batch;
+ cparams.n_ubatch = params.n_ubatch;
+ cparams.n_threads = params.n_threads;
+ cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ cparams.seed = params.seed;
+ cparams.logits_all = params.logits_all;
+ cparams.embeddings = params.embedding;
+ cparams.rope_scaling_type = params.rope_scaling_type;
+ cparams.rope_freq_base = params.rope_freq_base;
+ cparams.rope_freq_scale = params.rope_freq_scale;
+ cparams.yarn_ext_factor = params.yarn_ext_factor;
+ cparams.yarn_attn_factor = params.yarn_attn_factor;
+ cparams.yarn_beta_fast = params.yarn_beta_fast;
+ cparams.yarn_beta_slow = params.yarn_beta_slow;
+ cparams.yarn_orig_ctx = params.yarn_orig_ctx;
+ cparams.pooling_type = params.pooling_type;
+ cparams.defrag_thold = params.defrag_thold;
+ cparams.offload_kqv = !params.no_kv_offload;
+
+ cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
+ cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
+
+ return cparams;
+}
+
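+// Helpers for filling a llama_batch one token at a time, e.g.:
+//   llama_batch_clear(batch);
+//   llama_batch_add(batch, token_id, pos, { 0 }, true); // sequence 0, request logits for this token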
+void llama_batch_clear(struct llama_batch & batch) {
+ batch.n_tokens = 0;
+}
+
+void llama_batch_add(
+ struct llama_batch & batch,
+ llama_token id,
+ llama_pos pos,
+ const std::vector & seq_ids,
+ bool logits) {
+ batch.token [batch.n_tokens] = id;
+ batch.pos [batch.n_tokens] = pos;
+ batch.n_seq_id[batch.n_tokens] = seq_ids.size();
+ for (size_t i = 0; i < seq_ids.size(); ++i) {
+ batch.seq_id[batch.n_tokens][i] = seq_ids[i];
+ }
+ batch.logits [batch.n_tokens] = logits;
+
+ batch.n_tokens++;
+}
+
+#ifdef LLAMA_USE_CURL
+
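+// Download url to path with ETag / Last-Modified caching: a HEAD request is compared against the
+// saved .etag / .lastModified sidecar files and the download is skipped when nothing has changed.
+// Returns false on any curl or file-system error.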
+static bool llama_download_file(CURL * curl, const char * url, const char * path) {
+ bool force_download = false;
+
+ // Set the URL, allow to follow http redirection
+ curl_easy_setopt(curl, CURLOPT_URL, url);
+ curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+
+#if defined(_WIN32)
+ // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+ // operating system. Currently implemented under MS-Windows.
+ curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+
+ // Check if the file already exists locally
+ struct stat model_file_info;
+ auto file_exists = (stat(path, &model_file_info) == 0);
+
+ // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
+ char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+ char etag_path[PATH_MAX] = {0};
+ snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
+
+ char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+ char last_modified_path[PATH_MAX] = {0};
+ snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);
+
+ if (file_exists) {
+ auto * f_etag = fopen(etag_path, "r");
+ if (f_etag) {
+ if (!fgets(etag, sizeof(etag), f_etag)) {
+ fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
+ } else {
+ fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
+ }
+ fclose(f_etag);
+ }
+
+ auto * f_last_modified = fopen(last_modified_path, "r");
+ if (f_last_modified) {
+ if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
+ fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
+ } else {
+ fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
+ last_modified);
+ }
+ fclose(f_last_modified);
+ }
+ }
+
+ // Send a HEAD request to retrieve the etag and last-modified headers
+ struct llama_load_model_from_url_headers {
+ char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+ char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+ };
+ llama_load_model_from_url_headers headers;
+ {
+ typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+ auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+ llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
+
+ // Convert header field name to lowercase
+ for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
+ buffer[i] = tolower(buffer[i]);
+ }
+
+ const char * etag_prefix = "etag: ";
+ if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
+ strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
+ }
+
+ const char * last_modified_prefix = "last-modified: ";
+ if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) {
+ strncpy(headers->last_modified, buffer + strlen(last_modified_prefix),
+ n_items - strlen(last_modified_prefix) - 2); // Remove CRLF
+ }
+ return n_items;
+ };
+
+ curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+ curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress
+ curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+ curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers);
+
+ CURLcode res = curl_easy_perform(curl);
+ if (res != CURLE_OK) {
+ curl_easy_cleanup(curl);
+ fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+ return false;
+ }
+
+ long http_code = 0;
+ curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
+ if (http_code != 200) {
+ // HEAD not supported, we don't know if the file has changed
+ // force trigger downloading
+ force_download = true;
+ fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+ }
+ }
+
+ // If the ETag or the Last-Modified headers are different: trigger a new download
+ bool should_download = !file_exists
+ || force_download
+ || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0)
+ || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0);
+ if (should_download) {
+ char path_temporary[PATH_MAX] = {0};
+ snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
+ if (file_exists) {
+ fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path);
+ if (remove(path) != 0) {
+ curl_easy_cleanup(curl);
+ fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
+ return false;
+ }
+ }
+
+ // Set the output file
+ auto * outfile = fopen(path_temporary, "wb");
+ if (!outfile) {
+ curl_easy_cleanup(curl);
+ fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
+ return false;
+ }
+
+ typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
+ auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
+ return fwrite(data, size, nmemb, (FILE *)fd);
+ };
+ curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
+ curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+ curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile);
+
+ // display download progress
+ curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+
+ // helper function to hide password in URL
+ auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
+ std::size_t protocol_pos = url.find("://");
+ if (protocol_pos == std::string::npos) {
+ return url; // Malformed URL
+ }
+
+ std::size_t at_pos = url.find('@', protocol_pos + 3);
+ if (at_pos == std::string::npos) {
+ return url; // No password in URL
+ }
+
+ return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
+ };
+
+ // start the download
+ fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+ llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified);
+ auto res = curl_easy_perform(curl);
+ if (res != CURLE_OK) {
+ fclose(outfile);
+ curl_easy_cleanup(curl);
+ fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+ return false;
+ }
+
+ long http_code = 0;
+ curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code);
+ if (http_code < 200 || http_code >= 400) {
+ fclose(outfile);
+ curl_easy_cleanup(curl);
+ fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
+ return false;
+ }
+
+ // Clean up
+ fclose(outfile);
+
+ // Write the new ETag to the .etag file
+ if (strlen(headers.etag) > 0) {
+ auto * etag_file = fopen(etag_path, "w");
+ if (etag_file) {
+ fputs(headers.etag, etag_file);
+ fclose(etag_file);
+ fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag);
+ }
+ }
+
+ // Write the new lastModified to the .etag file
+ if (strlen(headers.last_modified) > 0) {
+ auto * last_modified_file = fopen(last_modified_path, "w");
+ if (last_modified_file) {
+ fputs(headers.last_modified, last_modified_file);
+ fclose(last_modified_file);
+ fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
+ headers.last_modified);
+ }
+ }
+
+ if (rename(path_temporary, path) != 0) {
+ curl_easy_cleanup(curl);
+ fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
+ return false;
+ }
+ }
+
+ return true;
+}
+
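+// Fetch the GGUF pointed to by model_url into path_model; if the GGUF header reports additional
+// splits, download the remaining shards in parallel, then load the model from the local path.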
+struct llama_model * llama_load_model_from_url(
+ const char * model_url,
+ const char * path_model,
+ const struct llama_model_params & params) {
+ // Basic validation of the model_url
+ if (!model_url || strlen(model_url) == 0) {
+ fprintf(stderr, "%s: invalid model_url\n", __func__);
+ return NULL;
+ }
+
+ // Initialize libcurl
+ auto * curl = curl_easy_init();
+
+ if (!curl) {
+ fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+ return NULL;
+ }
+
+ if (!llama_download_file(curl, model_url, path_model)) {
+ return NULL;
+ }
+
+ // check for additional GGUFs split to download
+ int n_split = 0;
+ {
+ struct gguf_init_params gguf_params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ NULL,
+ };
+ auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
+ if (!ctx_gguf) {
+ fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
+ curl_easy_cleanup(curl);
+ return NULL;
+ }
+
+ auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+ if (key_n_split >= 0) {
+ n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+ }
+
+ gguf_free(ctx_gguf);
+ }
+
+ curl_easy_cleanup(curl);
+
+ if (n_split > 1) {
+ char split_prefix[PATH_MAX] = {0};
+ char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+
+ // Verify the first split file format
+ // and extract split URL and PATH prefixes
+ {
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
+ fprintf(stderr, "\n%s: unexpected model file name: %s"
+ " n_split=%d\n", __func__, path_model, n_split);
+ return NULL;
+ }
+
+ if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
+ fprintf(stderr, "\n%s: unexpected model url: %s"
+ " n_split=%d\n", __func__, model_url, n_split);
+ return NULL;
+ }
+ }
+
+ // Prepare download in parallel
+        std::vector<std::future<bool>> futures_download;
+ for (int idx = 1; idx < n_split; idx++) {
+ futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
+ char split_path[PATH_MAX] = {0};
+ llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
+
+ char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+ llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
+
+ auto * curl = curl_easy_init();
+ bool res = llama_download_file(curl, split_url, split_path);
+ curl_easy_cleanup(curl);
+
+ return res;
+ }, idx));
+ }
+
+ // Wait for all downloads to complete
+ for (auto & f : futures_download) {
+ if (!f.get()) {
+ return NULL;
+ }
+ }
+ }
+
+ return llama_load_model_from_file(path_model, params);
+}
+
+struct llama_model * llama_load_model_from_hf(
+ const char * repo,
+ const char * model,
+ const char * path_model,
+ const struct llama_model_params & params) {
+ // construct hugging face model url:
+ //
+ // --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
+ // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
+ //
+ // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
+ // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
+ //
+
+ std::string model_url = "https://huggingface.co/";
+ model_url += repo;
+ model_url += "/resolve/main/";
+ model_url += model;
+
+ return llama_load_model_from_url(model_url.c_str(), path_model, params);
+}
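+
+// usage sketch (illustrative only -- the repo / file / local path strings below
+// are placeholders, not values used by this project):
+//
+//   struct llama_model_params mparams = llama_model_default_params();
+//   struct llama_model * model = llama_load_model_from_hf(
+//       "ggml-org/models",                    // HF repo
+//       "tinyllama-1.1b/ggml-model-f16.gguf", // file inside the repo
+//       "/sdcard/kantv/ggml-model-f16.gguf",  // local destination path
+//       mparams);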
+
+#else
+
+struct llama_model * llama_load_model_from_url(
+ const char * /*model_url*/,
+ const char * /*path_model*/,
+ const struct llama_model_params & /*params*/) {
+ fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
+ return nullptr;
+}
+
+struct llama_model * llama_load_model_from_hf(
+ const char * /*repo*/,
+ const char * /*model*/,
+ const char * /*path_model*/,
+ const struct llama_model_params & /*params*/) {
+ fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+ return nullptr;
+}
+
+#endif // LLAMA_USE_CURL
+
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+ auto mparams = llama_model_params_from_gpt_params(params);
+
+ llama_model * model = nullptr;
+
+ if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+ model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
+ } else if (!params.model_url.empty()) {
+ model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
+ } else {
+ model = llama_load_model_from_file(params.model.c_str(), mparams);
+ }
+
+ if (model == NULL) {
+ fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+ return std::make_tuple(nullptr, nullptr);
+ }
+
+ auto cparams = llama_context_params_from_gpt_params(params);
+
+ llama_context * lctx = llama_new_context_with_model(model, cparams);
+ if (lctx == NULL) {
+ fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
+ }
+
+ if (!params.control_vectors.empty()) {
+ if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
+ if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+
+ const auto cvec = llama_control_vector_load(params.control_vectors);
+ if (cvec.n_embd == -1) {
+ llama_free(lctx);
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
+ }
+
+ int err = llama_control_vector_apply(lctx,
+ cvec.data.data(),
+ cvec.data.size(),
+ cvec.n_embd,
+ params.control_vector_layer_start,
+ params.control_vector_layer_end);
+ if (err) {
+ llama_free(lctx);
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
+ }
+ }
+
+ for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
+ const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
+ float lora_scale = std::get<1>(params.lora_adapter[i]);
+ int err = llama_model_apply_lora_from_file(model,
+ lora_adapter.c_str(),
+ lora_scale,
+ ((i > 0) || params.lora_base.empty())
+ ? NULL
+ : params.lora_base.c_str(),
+ params.n_threads);
+ if (err != 0) {
+ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+ llama_free(lctx);
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
+ }
+ }
+
+ if (params.ignore_eos) {
+ params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+ }
+
+ {
+ LOGGD("warming up the model with an empty run\n");
+
+        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+ llama_kv_cache_clear(lctx);
+ llama_synchronize(lctx);
+ llama_reset_timings(lctx);
+ }
+
+ return std::make_tuple(model, lctx);
+}
+
+//
+// Vocab utils
+//
+
+std::vector<llama_token> llama_tokenize(
+ const struct llama_context * ctx,
+ const std::string & text,
+ bool add_bos,
+ bool special) {
+ return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
+}
+
+std::vector<llama_token> llama_tokenize(
+ const struct llama_model * model,
+ const std::string & text,
+ bool add_bos,
+ bool special) {
+ // upper limit for the number of tokens
+ int n_tokens = text.length() + add_bos;
+    std::vector<llama_token> result(n_tokens);
+ n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+ if (n_tokens < 0) {
+ result.resize(-n_tokens);
+ int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+ GGML_ASSERT(check == -n_tokens);
+ } else {
+ result.resize(n_tokens);
+ }
+ return result;
+}
+
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+    std::vector<char> result(8, 0);
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ if (n_tokens < 0) {
+ result.resize(-n_tokens);
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ GGML_ASSERT(check == -n_tokens);
+ } else {
+ result.resize(n_tokens);
+ }
+
+ return std::string(result.data(), result.size());
+}
+
+std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
+ const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
+
+ std::string piece;
+ std::string result;
+
+ for (size_t i = 0; i < tokens.size(); ++i) {
+ piece = llama_token_to_piece(ctx, tokens[i]);
+
+ // remove the leading space of the first non-BOS token
+ if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
+ piece = piece.substr(1);
+ }
+
+ result += piece;
+ }
+
+ return result;
+}
+
+std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
+ std::string piece;
+ std::string result;
+
+ for (size_t i = 0; i < tokens.size(); ++i) {
+ piece = llama_token_to_piece(ctx, tokens[i]);
+
+ result += piece;
+ }
+
+ // NOTE: the original tokenizer decodes bytes after collecting the pieces.
+ return result;
+}
+
+bool llama_should_add_bos_token(const llama_model * model) {
+ const int add_bos = llama_add_bos_token(model);
+
+ return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
+}
+
+//
+// YAML utils
+//
+
+// returns true if successful, false otherwise
+bool create_directory_with_parents(const std::string & path) {
+#ifdef _WIN32
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+ std::wstring wpath = converter.from_bytes(path);
+
+ // if the path already exists, check whether it's a directory
+ const DWORD attributes = GetFileAttributesW(wpath.c_str());
+ if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+ return true;
+ }
+
+ size_t pos_slash = 0;
+
+ // process path from front to back, procedurally creating directories
+ while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
+ const std::wstring subpath = wpath.substr(0, pos_slash);
+ const wchar_t * test = subpath.c_str();
+
+ const bool success = CreateDirectoryW(test, NULL);
+ if (!success) {
+ const DWORD error = GetLastError();
+
+ // if the path already exists, ensure that it's a directory
+ if (error == ERROR_ALREADY_EXISTS) {
+ const DWORD attributes = GetFileAttributesW(subpath.c_str());
+ if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ pos_slash += 1;
+ }
+
+ return true;
+#else
+ // if the path already exists, check whether it's a directory
+ struct stat info;
+ if (stat(path.c_str(), &info) == 0) {
+ return S_ISDIR(info.st_mode);
+ }
+
+ size_t pos_slash = 1; // skip leading slashes for directory creation
+
+ // process path from front to back, procedurally creating directories
+ while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
+ const std::string subpath = path.substr(0, pos_slash);
+ struct stat info;
+
+ // if the path already exists, ensure that it's a directory
+ if (stat(subpath.c_str(), &info) == 0) {
+ if (!S_ISDIR(info.st_mode)) {
+ return false;
+ }
+ } else {
+ // create parent directories
+ const int ret = mkdir(subpath.c_str(), 0755);
+ if (ret != 0) {
+ return false;
+ }
+ }
+
+ pos_slash += 1;
+ }
+
+ return true;
+#endif // _WIN32
+}
+
+void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data) {
+ if (data.empty()) {
+ fprintf(stream, "%s:\n", prop_name);
+ return;
+ }
+
+ fprintf(stream, "%s: [", prop_name);
+ for (size_t i = 0; i < data.size() - 1; ++i) {
+ fprintf(stream, "%e, ", data[i]);
+ }
+ fprintf(stream, "%e]\n", data.back());
+}
+
+void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data) {
+ if (data.empty()) {
+ fprintf(stream, "%s:\n", prop_name);
+ return;
+ }
+
+ fprintf(stream, "%s: [", prop_name);
+ for (size_t i = 0; i < data.size() - 1; ++i) {
+ fprintf(stream, "%d, ", data[i]);
+ }
+ fprintf(stream, "%d]\n", data.back());
+}
+
+void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) {
+ std::string data_str(data == NULL ? "" : data);
+
+ if (data_str.empty()) {
+ fprintf(stream, "%s:\n", prop_name);
+ return;
+ }
+
+ size_t pos_start = 0;
+ size_t pos_found = 0;
+
+ if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
+ data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
+ data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
+ data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
+ data_str = "\"" + data_str + "\"";
+ fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+ return;
+ }
+
+ if (data_str.find('\n') == std::string::npos) {
+ fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+ return;
+ }
+
+ fprintf(stream, "%s: |\n", prop_name);
+ while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
+ fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
+ pos_start = pos_found + 1;
+ }
+}
+
+std::string get_sortable_timestamp() {
+ using clock = std::chrono::system_clock;
+
+ const clock::time_point current_time = clock::now();
+ const time_t as_time_t = clock::to_time_t(current_time);
+ char timestamp_no_ns[100];
+ std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
+
+    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
+ current_time.time_since_epoch() % 1000000000).count();
+ char timestamp_ns[11];
+ snprintf(timestamp_ns, 11, "%09" PRId64, ns);
+
+ return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
+}
+
+//
+// KV cache utils
+//
+
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
+ static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
+
+ printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
+ view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+
+ llama_kv_cache_view_cell * c_curr = view.cells;
+ llama_seq_id * cs_curr = view.cells_sequences;
+
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
+ if (i % row_size == 0) {
+ printf("\n%5d: ", i);
+ }
+ int seq_count = 0;
+ for (int j = 0; j < view.n_seq_max; j++) {
+ if (cs_curr[j] >= 0) { seq_count++; }
+ }
+ putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
+ }
+
+ printf("\n=== Done dumping\n");
+}
+
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
+ static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+ printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
+ view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+
+    std::unordered_map<llama_seq_id, size_t> seqs;
+ llama_kv_cache_view_cell * c_curr = view.cells;
+ llama_seq_id * cs_curr = view.cells_sequences;
+
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
+ for (int j = 0; j < view.n_seq_max; j++) {
+ if (cs_curr[j] < 0) { continue; }
+ if (seqs.find(cs_curr[j]) == seqs.end()) {
+ if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
+ const size_t sz = seqs.size();
+ seqs[cs_curr[j]] = sz;
+ }
+ }
+ if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
+ }
+
+ printf("=== Sequence legend: ");
+ for (const auto & it : seqs) {
+ printf("%zu=%d, ", it.second, it.first);
+ }
+ printf("'+'=other sequence ids");
+
+ c_curr = view.cells;
+ cs_curr = view.cells_sequences;
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
+ if (i % row_size == 0) {
+ printf("\n%5d: ", i);
+ }
+ for (int j = 0; j < view.n_seq_max; j++) {
+ if (cs_curr[j] >= 0) {
+ const auto & it = seqs.find(cs_curr[j]);
+ putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
+ } else {
+ putchar('.');
+ }
+ }
+ putchar(' ');
+ }
+
+ printf("\n=== Done dumping\n");
+}
+
+void llama_embd_normalize(const float * inp, float * out, int n) {
+ double sum = 0.0;
+ for (int i = 0; i < n; i++) {
+ sum += inp[i] * inp[i];
+ }
+ sum = sqrt(sum);
+
+ const float norm = sum > 0.0 ? 1.0f / sum : 0.0f;
+
+ for (int i = 0; i < n; i++) {
+ out[i] = inp[i] * norm;
+ }
+}
+
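+// cosine similarity: dot(embd1, embd2) / (|embd1| * |embd2|)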
+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
+ double sum = 0.0;
+ double sum1 = 0.0;
+ double sum2 = 0.0;
+
+ for (int i = 0; i < n; i++) {
+ sum += embd1[i] * embd2[i];
+ sum1 += embd1[i] * embd1[i];
+ sum2 += embd2[i] * embd2[i];
+ }
+
+ return sum / (sqrt(sum1) * sqrt(sum2));
+}
+
+//
+// Control vector utils
+//
+
+static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
+ int32_t n_tensors;
+
+ size_t n_bytes = 0;
+
+ uint32_t max_direction_layer = 0;
+
+ llama_control_vector_data result = { -1, {} };
+
+ // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
+ {
+ struct ggml_init_params meta_params = {
+ /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
+ /* .mem_buffer = */ nullptr,
+ /* .no_alloc = */ true,
+ };
+ ggml_context * meta_ctx = ggml_init(meta_params);
+ struct gguf_init_params meta_gguf_params = {
+ /* .no_alloc = */ true,
+ /* .ctx = */ &meta_ctx,
+ };
+ struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+ if (!meta_ctx_gguf) {
+ fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
+ ggml_free(meta_ctx);
+ return result;
+ }
+
+ n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
+ for (int i = 0; i < n_tensors; i++) {
+ std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
+
+ // split on '.'
+ size_t dotpos = name.find('.');
+ if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+ try {
+ uint32_t layer = std::stoi(name.substr(dotpos + 1));
+ if (layer == 0) {
+ fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+ ggml_free(meta_ctx);
+ gguf_free(meta_ctx_gguf);
+ return result;
+ }
+ if (layer > max_direction_layer) {
+ max_direction_layer = layer;
+ }
+ } catch (...) {
+ fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+ ggml_free(meta_ctx);
+ gguf_free(meta_ctx_gguf);
+ return result;
+ }
+ }
+
+ struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
+ if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
+ fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+ ggml_free(meta_ctx);
+ gguf_free(meta_ctx_gguf);
+ return result;
+ }
+ if (result.n_embd == -1) {
+ result.n_embd = ggml_nelements(tensor_meta);
+ } else if (ggml_nelements(tensor_meta) != result.n_embd) {
+ fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
+ ggml_free(meta_ctx);
+ gguf_free(meta_ctx_gguf);
+ return result;
+ }
+ n_bytes += ggml_nbytes(tensor_meta);
+ }
+ ggml_free(meta_ctx);
+ gguf_free(meta_ctx_gguf);
+ }
+
+ if (n_tensors == 0) {
+ fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
+ return result;
+ }
+
+ // load and scale tensors into final control vector context
+ struct ggml_init_params ggml_params = {
+ /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes,
+ /* .mem_buffer = */ nullptr,
+ /* .no_alloc = */ false,
+ };
+ struct ggml_context * ctx = ggml_init(ggml_params);
+
+ struct gguf_init_params params = {
+ /*.no_alloc = */ false,
+ /*.ctx = */ &ctx,
+ };
+ struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
+ if (!ctx_gguf) {
+ fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
+ ggml_free(ctx);
+ return result;
+ }
+
+ // do not store data for layer 0 (it's not used)
+ result.data.resize(result.n_embd * max_direction_layer);
+
+ for (uint32_t il = 1; il <= max_direction_layer; il++) {
+ const std::string name = "direction." + std::to_string(il);
+ const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+
+ float * dst = result.data.data() + result.n_embd * (il - 1);
+
+ if (tensor) {
+ const float * src = (const float *) tensor->data;
+ for (int j = 0; j < result.n_embd; j++) {
+ dst[j] = src[j] * load_info.strength;
+ }
+ } else {
+ for (int j = 0; j < result.n_embd; j++) {
+ dst[j] = 0.0f;
+ }
+ }
+ }
+
+ return result;
+}
+
+llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
+ llama_control_vector_data result = { -1, {} };
+
+ for (const auto & info : load_infos) {
+ auto cur = llama_control_vector_load_one(info);
+
+ if (cur.n_embd == -1) {
+ return result;
+ }
+ if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
+ fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
+ return result;
+ }
+
+ if (result.n_embd == -1) {
+ result = std::move(cur);
+ } else {
+ for (size_t i = 0; i < cur.data.size(); i++) {
+ result.data[i] += cur.data[i];
+ }
+ }
+ }
+
+ if (result.n_embd == -1) {
+ fprintf(stderr, "%s: no vectors passed\n", __func__);
+ }
+
+ return result;
+}
diff --git a/external/whispercpp/jni/common.h b/external/whispercpp/jni/common.h
new file mode 100644
index 000000000..1044e3cd3
--- /dev/null
+++ b/external/whispercpp/jni/common.h
@@ -0,0 +1,318 @@
+// Various helper functions and utilities
+
+#pragma once
+
+#include "llama.h"
+
+#include "sampling.h"
+
+#include "libavutil/cde_log.h"
+
+#define LOG_NO_FILE_LINE_FUNCTION
+
+#include <cmath>
+#include <string>
+#include <vector>
+#include <random>
+#include <thread>
+#include <unordered_map>
+#include <tuple>
+
+#ifdef _WIN32
+#define DIRECTORY_SEPARATOR '\\'
+#else
+#define DIRECTORY_SEPARATOR '/'
+#endif // _WIN32
+
+#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
+#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
+
+#define print_build_info() do { \
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
+} while(0)
+
+// build info
+extern int LLAMA_BUILD_NUMBER;
+extern char const *LLAMA_COMMIT;
+extern char const *LLAMA_COMPILER;
+extern char const *LLAMA_BUILD_TARGET;
+
+struct llama_control_vector_load_info;
+
+int32_t get_num_physical_cores();
+
+//
+// CLI argument parsing
+//
+
+struct gpt_params {
+ uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
+
+ int32_t n_threads = std::thread().hardware_concurrency();
+ int32_t n_threads_draft = -1;
+ int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
+ int32_t n_threads_batch_draft = -1;
+ int32_t n_predict = -1; // new tokens to predict
+ int32_t n_ctx = 512; // context size
+ int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+ int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
+ int32_t n_draft = 5; // number of tokens to draft during speculative decoding
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
+ int32_t n_parallel = 1; // number of parallel sequences to decode
+ int32_t n_sequences = 1; // number of sequences to decode
+ float p_split = 0.1f; // speculative decoding split probability
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+ int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+ llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+ int32_t n_beams = 0; // if non-zero then use beam search of given width.
+ int32_t grp_attn_n = 1; // group-attention factor
+ int32_t grp_attn_w = 512; // group-attention width
+ int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
+ float rope_freq_base = 0.0f; // RoPE base frequency
+ float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+ float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
+ float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+ float yarn_beta_fast = 32.0f; // YaRN low correction dim
+ float yarn_beta_slow = 1.0f; // YaRN high correction dim
+ int32_t yarn_orig_ctx = 0; // YaRN original context length
+ float defrag_thold = -1.0f; // KV cache defragmentation threshold
+
+ ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
+
+ llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+ llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+
+    // sampling parameters
+ struct llama_sampling_params sparams;
+
+ std::string model = "/sdcard/kantv/llama-2-7b-chat.Q4_K_M.gguf"; // model path
+ std::string model_draft = ""; // draft model for speculative decoding
+ std::string model_alias = "unknown"; // model alias
+ std::string model_url = ""; // model url to download
+ std::string hf_repo = ""; // HF repo
+ std::string hf_file = ""; // HF file
+ std::string prompt = "";
+ std::string prompt_file = ""; // store the external prompt file name
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
+ std::string input_prefix = ""; // string to prefix user inputs with
+ std::string input_suffix = ""; // string to suffix user inputs with
+    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
+ std::string logdir = ""; // directory in which to save YAML log files
+ std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
+ std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
+ std::string logits_file = ""; // file for saving *all* logits
+
+    std::vector<llama_model_kv_override> kv_overrides;
+
+ // TODO: avoid tuple, use struct
+    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+ std::string lora_base = ""; // base model path for the lora adapter
+
+    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+
+ int32_t control_vector_layer_start = -1; // layer range for control vector
+ int32_t control_vector_layer_end = -1; // layer range for control vector
+
+ int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+ int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+ // (which is more convenient to use for plotting)
+ //
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
+
+ bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+ size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+
+ bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+ size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+
+ bool kl_divergence = false; // compute KL-divergence
+
+ bool random_prompt = false; // do not randomize prompt if none provided
+ bool use_color = false; // use color to distinguish generations and inputs
+ bool interactive = false; // interactive mode
+ bool chatml = false; // chatml mode (used for models trained on chatml syntax)
+ bool prompt_cache_all = false; // save user input and generations to prompt cache
+ bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
+
+ bool embedding = false; // get only sentence embedding
+ bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
+ bool interactive_first = false; // wait for user input immediately
+ bool multiline_input = false; // reverse the usage of `\`
+ bool simple_io = false; // improves compatibility with subprocesses and limited consoles
+ bool cont_batching = true; // insert new sequences for decoding on-the-fly
+
+ bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
+ bool ignore_eos = false; // ignore generated EOS tokens
+ bool instruct = false; // instruction mode (used for Alpaca models)
+ bool logits_all = false; // return logits for all tokens in the batch
+ bool use_mmap = true; // use mmap for faster loads
+ bool use_mlock = false; // use mlock to keep model in memory
+ bool verbose_prompt = false; // print prompt tokens before generation
+ bool display_prompt = true; // print prompt before generation
+ bool infill = false; // use infill mode
+ bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
+ bool no_kv_offload = false; // disable KV offloading
+
+ std::string cache_type_k = "f16"; // KV cache data type for the K
+ std::string cache_type_v = "f16"; // KV cache data type for the V
+
+ // multimodal models (see examples/llava)
+ std::string mmproj = ""; // path to multimodal projector
+ std::string image = ""; // path to an image file
+};
+
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+
+std::string get_system_info(const gpt_params & params);
+
+std::string gpt_random_prompt(std::mt19937 & rng);
+
+void process_escapes(std::string& input);
+
+//
+// String utils
+//
+
+std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
+std::vector<std::string> string_split(std::string input, char separator);
+std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
+
+//
+// Model utils
+//
+
+// TODO: avoid tuple, use struct
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+
+struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
+
+// Batch utils
+
+void llama_batch_clear(struct llama_batch & batch);
+
+void llama_batch_add(
+ struct llama_batch & batch,
+ llama_token id,
+ llama_pos pos,
+    const std::vector<llama_seq_id> & seq_ids,
+ bool logits);
+
+//
+// Vocab utils
+//
+
+// tokenizes a string into a vector of tokens
+// should work similar to Python's `tokenizer.encode`
+std::vector<llama_token> llama_tokenize(
+ const struct llama_context * ctx,
+ const std::string & text,
+ bool add_bos,
+ bool special = false);
+
+std::vector<llama_token> llama_tokenize(
+ const struct llama_model * model,
+ const std::string & text,
+ bool add_bos,
+ bool special = false);
+
+// tokenizes a token into a piece
+// should work similar to Python's `tokenizer.id_to_piece`
+std::string llama_token_to_piece(
+ const struct llama_context * ctx,
+ llama_token token);
+
+// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
+// that takes into account the tokenizer type and decides how to handle the leading space
+//
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+// removes the leading space from the first non-BOS token
+std::string llama_detokenize_spm(
+ llama_context * ctx,
+ const std::vector & tokens);
+
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+std::string llama_detokenize_bpe(
+    llama_context * ctx,
+    const std::vector<llama_token> & tokens);
+
+// Uses the value from the model metadata if possible, otherwise
+// defaults to true when model type is SPM, otherwise false.
+bool llama_should_add_bos_token(const llama_model * model);
+
+//
+// YAML utils
+//
+
+bool create_directory_with_parents(const std::string & path);
+void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
+void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
+void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
+std::string get_sortable_timestamp();
+
+void dump_non_result_info_yaml(
+ FILE * stream, const gpt_params & params, const llama_context * lctx,
+    const std::string & timestamp, const std::vector<llama_token> & prompt_tokens, const char * model_desc);
+
+//
+// KV cache utils
+//
+
+// Dump the KV cache view with the number of sequences per cell.
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+
+// Dump the KV cache view showing individual sequences in each cell (long output).
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
+//
+// Embedding utils
+//
+
+void llama_embd_normalize(const float * inp, float * out, int n);
+
+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+
+//
+// Control vector utils
+//
+
+struct llama_control_vector_data {
+ int n_embd;
+
+ // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
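+    // (the direction for layer il, 1-based, starts at data[(il - 1) * n_embd])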
+    std::vector<float> data;
+};
+
+struct llama_control_vector_load_info {
+ float strength;
+
+ std::string fname;
+};
+
+// Load control vectors, scale each by strength, and add them together.
+// On error, returns {-1, empty}
+llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+
+//
+// Split utils
+//
+static const char * const LLM_KV_SPLIT_NO = "split.no";
+static const char * const LLM_KV_SPLIT_COUNT = "split.count";
+static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
diff --git a/external/whispercpp/jni/grammar-parser.cpp b/external/whispercpp/jni/grammar-parser.cpp
new file mode 100644
index 000000000..2a1301569
--- /dev/null
+++ b/external/whispercpp/jni/grammar-parser.cpp
@@ -0,0 +1,440 @@
+#include "grammar-parser.h"
+#include <cstdint>
+#include <cwchar>
+#include <string>
+#include <utility>
+#include <stdexcept>
+#include <exception>
+
+namespace grammar_parser {
+ // NOTE: assumes valid utf8 (but checks for overrun)
+ // copied from llama.cpp
+    static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
+ static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+        uint8_t first_byte = static_cast<uint8_t>(*src);
+ uint8_t highbits = first_byte >> 4;
+ int len = lookup[highbits];
+ uint8_t mask = (1 << (8 - len)) - 1;
+ uint32_t value = first_byte & mask;
+ const char * end = src + len; // may overrun!
+ const char * pos = src + 1;
+ for ( ; pos < end && *pos; pos++) {
+            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+ }
+ return std::make_pair(value, pos);
+ }
+
+ static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
+        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
+ auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
+ return result.first->second;
+ }
+
+ static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
+        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
+ state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
+ return next_id;
+ }
+
+ static void add_rule(
+ parse_state & state,
+ uint32_t rule_id,
+            const std::vector<llama_grammar_element> & rule) {
+ if (state.rules.size() <= rule_id) {
+ state.rules.resize(rule_id + 1);
+ }
+ state.rules[rule_id] = rule;
+ }
+
+ static bool is_word_char(char c) {
+ return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
+ }
+
+    static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
+ const char * pos = src;
+ const char * end = src + size;
+ uint32_t value = 0;
+ for ( ; pos < end && *pos; pos++) {
+ value <<= 4;
+ char c = *pos;
+ if ('a' <= c && c <= 'f') {
+ value += c - 'a' + 10;
+ } else if ('A' <= c && c <= 'F') {
+ value += c - 'A' + 10;
+ } else if ('0' <= c && c <= '9') {
+ value += c - '0';
+ } else {
+ break;
+ }
+ }
+ if (pos != end) {
+ throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
+ }
+ return std::make_pair(value, pos);
+ }
+
+ static const char * parse_space(const char * src, bool newline_ok) {
+ const char * pos = src;
+ while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
+ (newline_ok && (*pos == '\r' || *pos == '\n'))) {
+ if (*pos == '#') {
+ while (*pos && *pos != '\r' && *pos != '\n') {
+ pos++;
+ }
+ } else {
+ pos++;
+ }
+ }
+ return pos;
+ }
+
+ static const char * parse_name(const char * src) {
+ const char * pos = src;
+ while (is_word_char(*pos)) {
+ pos++;
+ }
+ if (pos == src) {
+ throw std::runtime_error(std::string("expecting name at ") + src);
+ }
+ return pos;
+ }
+
+    static std::pair<uint32_t, const char *> parse_char(const char * src) {
+ if (*src == '\\') {
+ switch (src[1]) {
+ case 'x': return parse_hex(src + 2, 2);
+ case 'u': return parse_hex(src + 2, 4);
+ case 'U': return parse_hex(src + 2, 8);
+ case 't': return std::make_pair('\t', src + 2);
+ case 'r': return std::make_pair('\r', src + 2);
+ case 'n': return std::make_pair('\n', src + 2);
+ case '\\':
+ case '"':
+ case '[':
+ case ']':
+ return std::make_pair(src[1], src + 2);
+ default:
+ throw std::runtime_error(std::string("unknown escape at ") + src);
+ }
+ } else if (*src) {
+ return decode_utf8(src);
+ }
+ throw std::runtime_error("unexpected end of input");
+ }
+
+ const char * parse_alternates(
+ parse_state & state,
+ const char * src,
+ const std::string & rule_name,
+ uint32_t rule_id,
+ bool is_nested);
+
+ static const char * parse_sequence(
+ parse_state & state,
+ const char * src,
+ const std::string & rule_name,
+            std::vector<llama_grammar_element> & out_elements,
+ bool is_nested) {
+ size_t last_sym_start = out_elements.size();
+ const char * pos = src;
+ while (*pos) {
+ if (*pos == '"') { // literal string
+ pos++;
+ last_sym_start = out_elements.size();
+ while (*pos != '"') {
+ auto char_pair = parse_char(pos);
+ pos = char_pair.second;
+ out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
+ }
+ pos = parse_space(pos + 1, is_nested);
+ } else if (*pos == '[') { // char range(s)
+ pos++;
+ enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
+ if (*pos == '^') {
+ pos++;
+ start_type = LLAMA_GRETYPE_CHAR_NOT;
+ }
+ last_sym_start = out_elements.size();
+ while (*pos != ']') {
+ auto char_pair = parse_char(pos);
+ pos = char_pair.second;
+ enum llama_gretype type = last_sym_start < out_elements.size()
+ ? LLAMA_GRETYPE_CHAR_ALT
+ : start_type;
+
+ out_elements.push_back({type, char_pair.first});
+ if (pos[0] == '-' && pos[1] != ']') {
+ auto endchar_pair = parse_char(pos + 1);
+ pos = endchar_pair.second;
+ out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
+ }
+ }
+ pos = parse_space(pos + 1, is_nested);
+ } else if (is_word_char(*pos)) { // rule reference
+ const char * name_end = parse_name(pos);
+ uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
+ pos = parse_space(name_end, is_nested);
+ last_sym_start = out_elements.size();
+ out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
+ } else if (*pos == '(') { // grouping
+ // parse nested alternates into synthesized rule
+ pos = parse_space(pos + 1, true);
+ uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
+ pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
+ last_sym_start = out_elements.size();
+ // output reference to synthesized rule
+ out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+ if (*pos != ')') {
+ throw std::runtime_error(std::string("expecting ')' at ") + pos);
+ }
+ pos = parse_space(pos + 1, is_nested);
+ } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
+ if (last_sym_start == out_elements.size()) {
+ throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
+ }
+
+ // apply transformation to previous symbol (last_sym_start to end) according to
+ // rewrite rules:
+ // S* --> S' ::= S S' |
+ // S+ --> S' ::= S S' | S
+ // S? --> S' ::= S |
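+                // e.g. `ws ::= [ \t]*` becomes `ws ::= ws_N` plus a synthesized rule
+                // `ws_N ::= [ \t] ws_N | ` (N is the id assigned by generate_symbol_id,
+                // and the trailing alternate is empty)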
+ uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
+                std::vector<llama_grammar_element> sub_rule;
+ // add preceding symbol to generated rule
+ sub_rule.insert(
+ sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
+ if (*pos == '*' || *pos == '+') {
+ // cause generated rule to recurse
+ sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+ }
+ // mark start of alternate def
+ sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
+ if (*pos == '+') {
+ // add preceding symbol as alternate only for '+' (otherwise empty)
+ sub_rule.insert(
+ sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
+ }
+ sub_rule.push_back({LLAMA_GRETYPE_END, 0});
+ add_rule(state, sub_rule_id, sub_rule);
+
+ // in original rule, replace previous symbol with reference to generated rule
+ out_elements.resize(last_sym_start);
+ out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+
+ pos = parse_space(pos + 1, is_nested);
+ } else {
+ break;
+ }
+ }
+ return pos;
+ }
+
+ const char * parse_alternates(
+ parse_state & state,
+ const char * src,
+ const std::string & rule_name,
+ uint32_t rule_id,
+ bool is_nested) {
+        std::vector<llama_grammar_element> rule;
+ const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
+ while (*pos == '|') {
+ rule.push_back({LLAMA_GRETYPE_ALT, 0});
+ pos = parse_space(pos + 1, true);
+ pos = parse_sequence(state, pos, rule_name, rule, is_nested);
+ }
+ rule.push_back({LLAMA_GRETYPE_END, 0});
+ add_rule(state, rule_id, rule);
+ return pos;
+ }
+
+ static const char * parse_rule(parse_state & state, const char * src) {
+ const char * name_end = parse_name(src);
+ const char * pos = parse_space(name_end, false);
+ size_t name_len = name_end - src;
+ uint32_t rule_id = get_symbol_id(state, src, name_len);
+ const std::string name(src, name_len);
+
+ if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
+ throw std::runtime_error(std::string("expecting ::= at ") + pos);
+ }
+ pos = parse_space(pos + 3, true);
+
+ pos = parse_alternates(state, pos, name, rule_id, false);
+
+ if (*pos == '\r') {
+ pos += pos[1] == '\n' ? 2 : 1;
+ } else if (*pos == '\n') {
+ pos++;
+ } else if (*pos) {
+ throw std::runtime_error(std::string("expecting newline or end at ") + pos);
+ }
+ return parse_space(pos, true);
+ }
+
+ parse_state parse(const char * src) {
+ try {
+ parse_state state;
+ const char * pos = parse_space(src, true);
+ while (*pos) {
+ pos = parse_rule(state, pos);
+ }
+ // Validate the state to ensure that all rules are defined
+ for (const auto & rule : state.rules) {
+ for (const auto & elem : rule) {
+ if (elem.type == LLAMA_GRETYPE_RULE_REF) {
+ // Ensure that the rule at that location exists
+ if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
+ // Get the name of the rule that is missing
+ for (const auto & kv : state.symbol_ids) {
+ if (kv.second == elem.value) {
+ throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
+ }
+ }
+ }
+ }
+ }
+ }
+ return state;
+ } catch (const std::exception & err) {
+ fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
+ return parse_state();
+ }
+ }
+
+ static void print_grammar_char(FILE * file, uint32_t c) {
+ if (0x20 <= c && c <= 0x7f) {
+ fprintf(file, "%c", static_cast(c));
+ } else {
+ // cop out of encoding UTF-8
+ fprintf(file, "", c);
+ }
+ }
+
+ static bool is_char_element(llama_grammar_element elem) {
+ switch (elem.type) {
+ case LLAMA_GRETYPE_CHAR: return true;
+ case LLAMA_GRETYPE_CHAR_NOT: return true;
+ case LLAMA_GRETYPE_CHAR_ALT: return true;
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+ default: return false;
+ }
+ }
+
+    static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
+ for (auto elem : rule) {
+ switch (elem.type) {
+ case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
+ case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
+ case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
+ case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
+ case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
+ case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
+ }
+ switch (elem.type) {
+ case LLAMA_GRETYPE_END:
+ case LLAMA_GRETYPE_ALT:
+ case LLAMA_GRETYPE_RULE_REF:
+ fprintf(file, "(%u) ", elem.value);
+ break;
+ case LLAMA_GRETYPE_CHAR:
+ case LLAMA_GRETYPE_CHAR_NOT:
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+ case LLAMA_GRETYPE_CHAR_ALT:
+ fprintf(file, "(\"");
+ print_grammar_char(file, elem.value);
+ fprintf(file, "\") ");
+ break;
+ }
+ }
+ fprintf(file, "\n");
+ }
+
+ static void print_rule(
+ FILE * file,
+ uint32_t rule_id,
+            const std::vector<llama_grammar_element> & rule,
+            const std::map<uint32_t, std::string> & symbol_id_names) {
+ if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
+ throw std::runtime_error(
+ "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
+ }
+ fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
+ for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
+ llama_grammar_element elem = rule[i];
+ switch (elem.type) {
+ case LLAMA_GRETYPE_END:
+ throw std::runtime_error(
+ "unexpected end of rule: " + std::to_string(rule_id) + "," +
+ std::to_string(i));
+ case LLAMA_GRETYPE_ALT:
+ fprintf(file, "| ");
+ break;
+ case LLAMA_GRETYPE_RULE_REF:
+ fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
+ break;
+ case LLAMA_GRETYPE_CHAR:
+ fprintf(file, "[");
+ print_grammar_char(file, elem.value);
+ break;
+ case LLAMA_GRETYPE_CHAR_NOT:
+ fprintf(file, "[^");
+ print_grammar_char(file, elem.value);
+ break;
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+ if (i == 0 || !is_char_element(rule[i - 1])) {
+ throw std::runtime_error(
+ "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
+ std::to_string(rule_id) + "," + std::to_string(i));
+ }
+ fprintf(file, "-");
+ print_grammar_char(file, elem.value);
+ break;
+ case LLAMA_GRETYPE_CHAR_ALT:
+ if (i == 0 || !is_char_element(rule[i - 1])) {
+ throw std::runtime_error(
+ "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
+ std::to_string(rule_id) + "," + std::to_string(i));
+ }
+ print_grammar_char(file, elem.value);
+ break;
+ }
+ if (is_char_element(elem)) {
+ switch (rule[i + 1].type) {
+ case LLAMA_GRETYPE_CHAR_ALT:
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+ break;
+ default:
+ fprintf(file, "] ");
+ }
+ }
+ }
+ fprintf(file, "\n");
+ }
+
+ void print_grammar(FILE * file, const parse_state & state) {
+ try {
+            std::map<uint32_t, std::string> symbol_id_names;
+ for (const auto & kv : state.symbol_ids) {
+ symbol_id_names[kv.second] = kv.first;
+ }
+ for (size_t i = 0, end = state.rules.size(); i < end; i++) {
+ // fprintf(file, "%zu: ", i);
+ // print_rule_binary(file, state.rules[i]);
+ print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
+ // fprintf(file, "\n");
+ }
+ } catch (const std::exception & err) {
+ fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
+ }
+ }
+
+    std::vector<const llama_grammar_element *> parse_state::c_rules() {
+        std::vector<const llama_grammar_element *> ret;
+ ret.reserve(rules.size());
+ for (const auto & rule : rules) {
+ ret.push_back(rule.data());
+ }
+ return ret;
+ }
+}
diff --git a/external/whispercpp/jni/grammar-parser.h b/external/whispercpp/jni/grammar-parser.h
new file mode 100644
index 000000000..9037d7272
--- /dev/null
+++ b/external/whispercpp/jni/grammar-parser.h
@@ -0,0 +1,29 @@
+// Implements a parser for an extended Backus-Naur form (BNF), producing the
+// binary context-free grammar format specified by llama.h. Supports character
+// ranges, grouping, and repetition operators. As an example, a grammar for
+// arithmetic might look like:
+//
+// root ::= expr
+// expr ::= term ([-+*/] term)*
+// term ::= num | "(" space expr ")" space
+// num ::= [0-9]+ space
+// space ::= [ \t\n]*
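+//
+// Typical use (sketch; `grammar_str` is assumed to hold the grammar text):
+//
+//   grammar_parser::parse_state state = grammar_parser::parse(grammar_str.c_str());
+//   std::vector<const llama_grammar_element *> rules = state.c_rules();
+//   struct llama_grammar * grammar = llama_grammar_init(
+//       rules.data(), rules.size(), state.symbol_ids.at("root"));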
+
+#pragma once
+#include "llama.h"
+#include <vector>
+#include <map>