Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PoC: Add Qualcomm mobile SoC native backend for GGML #149

Merged
merged 44 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
a2cf3a6
doc: update software architecture diagram
zhouwg Apr 6, 2024
ad5bc94
project: fix issue in merge
zhouwg Apr 6, 2024
aa40397
project: refine project
zhouwg Apr 7, 2024
f68e711
doc: update software architecture diagram
zhouwg Apr 7, 2024
850f01d
Merge branch 'master' into kantv-poc-with-qnn
zhouwg Apr 7, 2024
75799df
github-issue: fix https://github.com/zhouwg/kantv/issues/103 finally …
zhouwg Apr 7, 2024
1cbb98f
build: fix minor issue in build system
zhouwg Apr 7, 2024
4e08bd4
PoC-S26: offload simple f32 2x2 matrix addition operation to QNN CPU …
zhouwg Apr 7, 2024
5df736b
PoC-S27: offload simple f32 2x2 matrix addition operation to QNN GPU …
zhouwg Apr 8, 2024
c0dd9e2
PoC-S29&S30&32&33: mapping ggml_tensor to QNN tensor -- done
zhouwg Apr 8, 2024
a6469c6
project:fix conflict in merge
zhouwg Apr 9, 2024
23c03c1
project: sync with upstream llama.cpp/ggml
zhouwg Apr 9, 2024
336f63f
ggml-jni: better method to keep sync ggml_op between Java/JNI/ggml.h
zhouwg Apr 9, 2024
ed92fa3
project: fix merge conflict
zhouwg Apr 9, 2024
fcbe3c5
ggml-jni: build code skeleton of PoC-S35&S37:implement a complex/comp…
zhouwg Apr 9, 2024
9944269
project:fix merge conflict
zhouwg Apr 9, 2024
6900d7e
ggml-jni: fix misunderstanding from "compute graph" to "computation g…
zhouwg Apr 10, 2024
7f8d9ae
project:fix merge conflict
zhouwg Apr 10, 2024
0fce196
project: enable APK running well on any mainstream Qualcomm mobile So…
zhouwg Apr 10, 2024
911157e
PoC-S41: HLD of ggml-qnn backend done --- apk bootstrap(with qnn back…
zhouwg Apr 11, 2024
21dea39
PoC-S42: implementation of major GGML OP(mulmat...) using QNN SDK ---…
zhouwg Apr 12, 2024
2b00eb1
PoC-S45: validate PoC-S42 by llama.cpp --- not work
zhouwg Apr 13, 2024
375a5b4
PoC-S42:prepare for implementation of GGML OP using QNN SDK
zhouwg Apr 14, 2024
ce44da6
PoC-S43: implementation of major GGML OP(mulmat) using QNN CPU backend
zhouwg Apr 15, 2024
dcb0bcf
project: regular refine file and directory
zhouwg Apr 15, 2024
390fddd
project: fix typo
zhouwg Apr 15, 2024
7cb1960
PoC-S49: implementation of other GGML OP(non-mulmat) using QNN API
zhouwg Apr 16, 2024
3957c89
PoC-S44:implementation of datapath using QNN GPU backend: UI <---> Ja…
zhouwg Apr 16, 2024
62831a6
PoC-S48:validate PoC-S42/PoC-S44/PoC-S46(aka QNN backend) by whisper.…
zhouwg Apr 17, 2024
083ad08
PoC-S48:validate PoC-S42/PoC-S44/PoC-S46(aka QNN backend) by whisper.…
zhouwg Apr 18, 2024
87a1ef9
PoC-S48:fix a typo in UT for PoC-S49:implementation of other GGML OP(…
zhouwg Apr 18, 2024
5991f6c
ggml-qnn: toggle between ggml and QNN CPU backend
zhouwg Apr 18, 2024
bb31748
ggml-qnn: qnn htp(aka dsp) backend works as expected on Xiaomi14
zhouwg Apr 19, 2024
0ca0852
ggml-qnn: qnn rpc mem works in jni function qnn_matrix
zhouwg Apr 19, 2024
c78e6ea
ggml-qnn: enable real QNN backend with GGML_OP_ADD/GGML_OP_MUL/GGML_O…
zhouwg Apr 19, 2024
e8a1918
ggml-qnn: fix compatible issue on Qualcomm SoC based low-end Android …
zhouwg Apr 20, 2024
2c9b8f6
ggml-jni: add automation UT for ggml ops --- op_add,op_mul,op_mulmat
zhouwg Apr 20, 2024
8a73098
ggml-qnn: add resource management of internal QNN resource and refine…
zhouwg Apr 21, 2024
83aa795
PoC-S61: refine code(remove dependence......) --- prepare remove LOGG…
zhouwg Apr 22, 2024
0e0033d
PoC-S52: multithread supportive using QNN CPU backend in ggml-qnn.cpp
zhouwg Apr 22, 2024
74a57c3
PoC-S53: fix stability issue during toggle between different backend(…
zhouwg Apr 22, 2024
b59482c
ggml-qnn: refine internal doc in ggml-qnn.cpp
zhouwg Apr 22, 2024
f290eb9
PoC-S54: validate with llama.cpp using QNN backend
zhouwg Apr 22, 2024
f759d4b
project: fix merge conflict
zhouwg Apr 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,5 @@ prebuilts/toolchain/emsdk/
prebuilts/toolchain/*.zip
prebuilts/toolchain/*.xz
prebuilts/toolchain/*.gz

crash.log
Original file line number Diff line number Diff line change
Expand Up @@ -260,18 +260,25 @@ public class CDEUtils {
public static final int ASR_MODE_TRANSCRIPTION_RECORD = 3; // transcription + audio record

//keep sync with ggml-jni.h
public static final int BECHMARK_ASR = 0;
public static final int BECHMARK_MEMCPY = 1;
public static final int BECHMARK_MULMAT = 2;
public static final int BECHMARK_FULL = 3;
public static final int BENCHMARK_MATRIX = 4;
public static final int BENCHMARK_LLM = 5;
public static final int BENCHMARK_STABLEDIFFUSION= 6;
public static final int BENCHMARK_QNN_SAMPLE = 7;
public static final int BENCHMARK_QNN_SAVER = 8;
public static final int BENCHMARK_QNN_MATRIX = 9;
public static final int BENCHMARK_QNN_GGML = 10;
public static final int BENCHMAKR_QNN_COMPLEX = 11;
public static final int BENCHMARK_ASR = 0;
public static final int BENCHMARK_MEMCPY = 1;
public static final int BENCHMARK_MULMAT = 2;
public static final int BENCHMARK_FULL = 3;
public static final int BENCHMARK_LLM = 4;
public static final int BENCHMARK_STABLEDIFFUSION= 5;
public static final int BENCHMARK_QNN_SAMPLE = 6;
public static final int BENCHMARK_QNN_SAVER = 7;
public static final int BENCHMARK_QNN_MATRIX = 8;
public static final int BENCHMARK_QNN_GGML = 9;
public static final int BENCHMARK_QNN_COMPLEX = 10;
public static final int BENCHMARK_QNN_GGML_OP = 11;
public static final int BENCHMARK_QNN_AUTO_UT = 12;

//QNN backend type ids (passed to the native layer as nBackendType); keep in sync with ggml-qnn.h
public static final int QNN_BACKEND_CPU = 0;
public static final int QNN_BACKEND_GPU = 1;
public static final int QNN_BACKEND_HTP = 2;
public static final int QNN_BACKEND_GGML = 3; //"fake" QNN backend, used only to compare performance between QNN and original GGML


private static int mASRMode = ASR_MODE_NORMAL;
Expand Down Expand Up @@ -3909,20 +3916,17 @@ public void onClick(DialogInterface dialog, int which) {

public static String getBenchmarkDesc(int benchmarkIndex) {
switch (benchmarkIndex) {
case BECHMARK_FULL:
return "GGML whisper_encode";
case BENCHMARK_FULL:
return "GGML whisper full";

case BECHMARK_MEMCPY:
return "GGML memcopy";
case BENCHMARK_MEMCPY:
return "GGML memcpy";

case BECHMARK_MULMAT:
case BENCHMARK_MULMAT:
return "GGML matrix multiply";

case BECHMARK_ASR:
return "GGML ASR inference";

case BENCHMARK_MATRIX:
return "GGML matrix";
case BENCHMARK_ASR:
return "GGML whisper ASR";

case BENCHMARK_LLM:
return "GGML LLAMA";
Expand All @@ -3936,20 +3940,53 @@ public static String getBenchmarkDesc(int benchmarkIndex) {
case BENCHMARK_QNN_SAVER:
return "GGML QNN saver";


case BENCHMARK_QNN_MATRIX:
return "GGML QNN matrix manipulate";
return "GGML QNN matrix addition";

case BENCHMARK_QNN_GGML:
return "GGML QNN ggml";
return "GGML QNN mapping ggml tensor";

case BENCHMAKR_QNN_COMPLEX:
case BENCHMARK_QNN_COMPLEX:
return "GGML QNN complex graph";

case BENCHMARK_QNN_GGML_OP:
return "GGML QNN OP UT"; //UT for PoC-S49: implementation of GGML OPs using QNN API

case BENCHMARK_QNN_AUTO_UT:
return "GGML QNN OP UT automation"; //automation UT for PoC-S49: implementation of GGML OPs using QNN API
}

return "unknown";
}


/**
 * Returns a short display label for a QNN backend type id.
 *
 * Keep in sync with ggml-qnn.cpp. Only the QNN CPU, GPU and HTP (aka DSP)
 * backends are mapped, plus id 3 for plain ggml; the QNN cDSP and HTA
 * backends are not used currently and have no id here.
 *
 * @param n_backend_type backend type id: 0, 1, 2 or 3
 * @return the label for the id, or "unknown" for any other value
 */
public static String getBackendDesc(int n_backend_type) {
    // Labels indexed by backend type id; index 3 is the "fake" QNN backend,
    // used only to compare performance between QNN and original GGML.
    final String[] backendLabels = {"QNN-CPU", "QNN-GPU", "QNN-HTP(DSP)", "ggml"};

    if (n_backend_type < 0 || n_backend_type >= backendLabels.length) {
        return "unknown";
    }
    return backendLabels[n_backend_type];
}


public static String getGGMLModeString(int ggmlModeType) {
switch (ggmlModeType) {
case 0:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,15 +98,16 @@ public enum ggml_op {
GGML_OP_COUNT,
};

public static native int asr_init(String strModelPath, int nThreadCounts, int nASRMode);

public static native int asr_init(String strModelPath, int nThreadCounts, int nASRMode, int nBackendType);

public static native void asr_finalize();

public static native void asr_start();

public static native void asr_stop();

public static native int asr_reset(String strModelPath, int nThreadCounts, int nASRMode);
public static native int asr_reset(String strModelPath, int nThreadCounts, int nASRMode, int nBackendType);

public static native String asr_get_systeminfo();

Expand All @@ -118,7 +119,7 @@ public enum ggml_op {
/**
* @param modelPath /sdcard/kantv/ggml-xxxxxx.bin or /sdcard/kantv/xxxxxx.gguf or qualcomm's prebuilt dedicated model.so or ""
* @param audioPath /sdcard/kantv/jfk.wav
* @param nBenchType 0: asr(transcription) 1: memcpy 2: mulmat 3: full/whisper_encode 4: matrix 5: LLAMA 6: stable diffusion 7: QNN sample 8: QNN saver 9: QNN matrix 10: QNN GGML 11: QNN complex
* @param nBenchType 0: whisper asr 1: memcpy 2: mulmat 3: whisper full 4: LLAMA 5: stable diffusion 6: QNN sample 7: QNN saver 8: QNN matrix 9: QNN GGML 10: QNN complex 11: QNN GGML OP(QNN UT) 12: QNN UT automation
* @param nThreadCounts 1 - 8
* @param nBackendType 0: CPU 1: GPU 2: DSP 3: ggml ("fake" QNN backend, used only for performance comparison)
* @param nOpType type of matrix manipulation / GGML OP / type of complex computation graph
Expand All @@ -130,5 +131,5 @@ public enum ggml_op {
public static native String llm_get_systeminfo();


public static native String llm_inference(String modelPath, String prompt, int nBenchType, int nThreadCounts);
public static native String llm_inference(String modelPath, String prompt, int nBenchType, int nThreadCounts, int nBackendType);
}
Binary file added cdeosplayer/kantv/src/main/assets/libQnnHtp.so
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,28 @@ public void initGlobal() {
CDEAssetLoader.copyAssetFile(mContext, ggmlModelFileName, CDEUtils.getDataPath() + ggmlModelFileName);
CDEAssetLoader.copyAssetFile(mContext, ggmlSampleFileName, CDEUtils.getDataPath() + ggmlSampleFileName);


//for PoC:Add Qualcomm mobile SoC native backend for GGML, https://github.com/zhouwg/kantv/issues/121
CDEAssetLoader.copyAssetFile(mContext, "libInception_v3.so", CDEUtils.getDataPath(mContext) + "libInception_v3.so");
//qualcomm's prebuilt QNN userspace library
CDEAssetLoader.copyAssetFile(mContext, "libQnnCpu.so", CDEUtils.getDataPath(mContext) + "libQnnCpu.so");
CDEAssetLoader.copyAssetFile(mContext, "libQnnGpu.so", CDEUtils.getDataPath(mContext) + "libQnnGpu.so");
CDEAssetLoader.copyAssetFile(mContext, "libQnnDsp.so", CDEUtils.getDataPath(mContext) + "libQnnDsp.so");
CDEAssetLoader.copyAssetFile(mContext, "libQnnHtp.so", CDEUtils.getDataPath(mContext) + "libQnnHtp.so");
CDEAssetLoader.copyAssetFile(mContext, "libQnnHtpNetRunExtensions.so", CDEUtils.getDataPath(mContext) + "libQnnHtpNetRunExtensions.so");
CDEAssetLoader.copyAssetFile(mContext, "libQnnHtpPrepare.so", CDEUtils.getDataPath(mContext) + "libQnnHtpPrepare.so");
CDEAssetLoader.copyAssetFile(mContext, "libQnnHtpV75Stub.so", CDEUtils.getDataPath(mContext) + "libQnnHtpV75Stub.so");
CDEAssetLoader.copyAssetFile(mContext, "libQnnHtpV75Skel.so", CDEUtils.getDataPath(mContext) + "libQnnHtpV75Skel.so");
CDEAssetLoader.copyAssetFile(mContext, "libQnnSystem.so", CDEUtils.getDataPath(mContext) + "libQnnSystem.so");
CDEAssetLoader.copyAssetFile(mContext, "libQnnSaver.so", CDEUtils.getDataPath(mContext) + "libQnnSaver.so");
CDEAssetLoader.copyAssetFile(mContext, "params.bin", CDEUtils.getDataPath() + "params.bin");
//qualcomm's prebuilt binary file
CDEAssetLoader.copyAssetFile(mContext, "raw_list.txt", CDEUtils.getDataPath() + "raw_list.txt");
CDEAssetLoader.copyAssetDir(mContext, "data", CDEUtils.getDataPath() + "data");
//prebuilt data from https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
CDEAssetLoader.copyAssetFile(mContext, "ln.bin", CDEUtils.getDataPath() + "ln.bin");


CDEAssetLoader.copyAssetFile(mContext, "config.json", CDEAssetLoader.getDataPath(mContext) + "config.json");
String configString = CDEAssetLoader.readTextFromFile(CDEAssetLoader.getDataPath(mContext) + "config.json");
JSONObject jsonObject = JSON.parseObject(configString);
Expand Down Expand Up @@ -373,9 +395,9 @@ public void initGlobal() {
CDELog.d(TAG, "cpu core counts:" + ggmljava.get_cpu_core_counts());
CDELog.j(TAG, "asr mode: " + mSettings.getASRMode());
if ((CDEUtils.ASR_MODE_NORMAL == mSettings.getASRMode()) || (CDEUtils.ASR_MODE_TRANSCRIPTION_RECORD == mSettings.getASRMode())) {
result = ggmljava.asr_init(modelPath, mSettings.getASRThreadCounts(), CDEUtils.ASR_MODE_NORMAL);
result = ggmljava.asr_init(modelPath, mSettings.getASRThreadCounts(), CDEUtils.ASR_MODE_NORMAL, CDEUtils.QNN_BACKEND_GGML);
} else {
result = ggmljava.asr_init(modelPath, mSettings.getASRThreadCounts(), CDEUtils.ASR_MODE_PRESURETEST);
result = ggmljava.asr_init(modelPath, mSettings.getASRThreadCounts(), CDEUtils.ASR_MODE_PRESURETEST, CDEUtils.QNN_BACKEND_GGML);
}
CDEUtils.setASRConfig("whispercpp", modelPath, asrThreadCounts + 1, asrMode);
CDEUtils.setTVASR(false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1577,9 +1577,9 @@ private void onASRStart(int asrMode) {

if (CDEUtils.getASRSubsystemInit()) {
if ((CDEUtils.ASR_MODE_NORMAL == mSettings.getASRMode()) || (CDEUtils.ASR_MODE_TRANSCRIPTION_RECORD == mSettings.getASRMode())) {
ggmljava.asr_reset(CDEUtils.getDataPath() + ggmlModelFileName, mSettings.getASRThreadCounts(), CDEUtils.ASR_MODE_NORMAL);
ggmljava.asr_reset(CDEUtils.getDataPath() + ggmlModelFileName, mSettings.getASRThreadCounts(), CDEUtils.ASR_MODE_NORMAL, CDEUtils.QNN_BACKEND_GGML);
} else {
ggmljava.asr_reset(CDEUtils.getDataPath() + ggmlModelFileName, mSettings.getASRThreadCounts(), CDEUtils.ASR_MODE_PRESURETEST);
ggmljava.asr_reset(CDEUtils.getDataPath() + ggmlModelFileName, mSettings.getASRThreadCounts(), CDEUtils.ASR_MODE_PRESURETEST, CDEUtils.QNN_BACKEND_GGML);
}
ggmljava.asr_start();
} else {
Expand Down
Loading