Skip to content

Commit

Permalink
sam : start self-attention impl (WIP)
Browse files Browse the repository at this point in the history
sam : split QKV into separate tensors

sam : compute KQ

sam : relative positional embeddings in the attention (WIP)

ggml : add ggml_add_rel_pos() operator

sam : soft max in attention

sam : KQV correct

sam : encoder attention is ready

sam : encoder feed-forward

sam : add scratch buffers

sam : encoder neck 0 works

sam : finalize the encoder

sam : fix bug in ggml_conv_2d_s1_ph()

Now the image Encoder fully matches the original results

sam : prompt encoder for single point

sam : state
  • Loading branch information
ggerganov committed Jun 26, 2023
1 parent 9c65226 commit dffd07e
Show file tree
Hide file tree
Showing 9 changed files with 9,913 additions and 3 deletions.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ add_subdirectory(dolly-v2)
add_subdirectory(replit)
add_subdirectory(mpt)
add_subdirectory(starcoder)
add_subdirectory(sam)
43 changes: 43 additions & 0 deletions examples/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -744,3 +744,46 @@ float similarity(const std::string & s0, const std::string & s1) {

return 1.0f - (dist / std::max(s0.size(), s1.size()));
}

// Parse command-line arguments for the SAM example into `params`.
//
// Recognized options:
//   -s, --seed N      RNG seed
//   -t, --threads N   number of compute threads
//   -m, --model F     model file path
//   -i, --inp F       input image path
//   -o, --out F       output file path
//   -h, --help        print usage and exit(0)
//
// Returns true on success. On an unknown option, or a flag missing its
// value, prints usage to stderr and exits with a non-zero status.
bool sam_params_parse(int argc, char ** argv, sam_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

        // Fetch the mandatory value following the current flag.
        // Previously `argv[++i]` could run past argc when a flag was the
        // last token, passing nullptr into std::stoi / std::string (UB).
        auto next_arg = [&]() -> const char * {
            if (i + 1 >= argc) {
                fprintf(stderr, "error: missing value for argument: %s\n", arg.c_str());
                sam_print_usage(argc, argv, params);
                exit(1);
            }
            return argv[++i];
        };

        if (arg == "-s" || arg == "--seed") {
            params.seed = std::stoi(next_arg());
        } else if (arg == "-t" || arg == "--threads") {
            params.n_threads = std::stoi(next_arg());
        } else if (arg == "-m" || arg == "--model") {
            params.model = next_arg();
        } else if (arg == "-i" || arg == "--inp") {
            params.fname_inp = next_arg();
        } else if (arg == "-o" || arg == "--out") {
            params.fname_out = next_arg();
        } else if (arg == "-h" || arg == "--help") {
            sam_print_usage(argc, argv, params);
            exit(0);
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            sam_print_usage(argc, argv, params);
            exit(1); // was exit(0): reported an error but signalled success
        }
    }

    return true;
}

// Print the SAM example's usage/help text to stderr, including the
// current defaults taken from `params`.
void sam_print_usage(int argc, char ** argv, const sam_params & params) {
    (void) argc; // unused; kept so the signature mirrors sam_params_parse

    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fputs("\n", stderr);
    fputs("options:\n", stderr);
    fputs(" -h, --help show this help message and exit\n", stderr);
    fputs(" -s SEED, --seed SEED RNG seed (default: -1)\n", stderr);
    fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
    fputs(" -m FNAME, --model FNAME\n", stderr);
    fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
    fputs(" -i FNAME, --inp FNAME\n", stderr);
    fprintf(stderr, " input file (default: %s)\n", params.fname_inp.c_str());
    fputs(" -o FNAME, --out FNAME\n", stderr);
    fprintf(stderr, " output file (default: %s)\n", params.fname_out.c_str());
    fputs("\n", stderr);
}
19 changes: 18 additions & 1 deletion examples/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#define COMMON_SAMPLE_RATE 16000

//
// CLI argument parsing
// GPT CLI argument parsing
//

struct gpt_params {
Expand Down Expand Up @@ -151,3 +151,20 @@ bool vad_simple(

// compute similarity between two strings using Levenshtein distance
float similarity(const std::string & s0, const std::string & s1);

//
// SAM argument parsing
//

// Runtime options for the SAM (Segment Anything) example.
// Filled in from the command line by sam_params_parse().
struct sam_params {
    int32_t seed      = -1; // RNG seed (default: -1)
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); // compute threads, capped at 4

    std::string model     = "models/sam-vit-b/ggml-model-f16.bin"; // model path
    std::string fname_inp = "img.jpg"; // input image file
    std::string fname_out = "img.out"; // output file
};

// Parse argv into `params`. Returns true on success; prints usage and
// exits on --help or on a malformed argument.
bool sam_params_parse(int argc, char ** argv, sam_params & params);

// Print usage/help text (with current defaults from `params`) to stderr.
void sam_print_usage(int argc, char ** argv, const sam_params & params);
13 changes: 13 additions & 0 deletions examples/sam/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#
# sam — example executable for the Segment Anything model
#

set(TEST_TARGET sam)
add_executable(${TEST_TARGET} main.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common)

#
# sam-quantize — model quantization tool
# (disabled: quantize.cpp does not exist yet; kept as a template for later)
#

#set(TEST_TARGET sam-quantize)
#add_executable(${TEST_TARGET} quantize.cpp)
#target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
137 changes: 137 additions & 0 deletions examples/sam/convert-pth-to-ggml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Convert a SAM model checkpoint to a ggml compatible file
#
# Usage: convert-pth-to-ggml.py file-model ftype
#
# The converted model is written next to the input checkpoint as
# ggml-model-<f32|f16>.bin. Only the image encoder and prompt encoder
# tensors are exported for now.

import os
import sys
import code
import json
import torch
import struct
import numpy as np

if len(sys.argv) < 3:
    print("Usage: convert-pth-to-ggml.py file-model ftype\n")
    print(" ftype == 0 -> float32")
    print(" ftype == 1 -> float16")
    sys.exit(1)

# output in the same directory as the model
fname_model = sys.argv[1]
fname_out = os.path.dirname(fname_model) + "/ggml-model.bin"

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

# the `< 3` check above guarantees argv[2] exists, so read it directly
ftype = int(sys.argv[2])
if ftype < 0 or ftype > 1:
    print("Invalid ftype: " + str(ftype))
    sys.exit(1)

fname_out = fname_out.replace(".bin", "-" + ftype_str[ftype] + ".bin")

model = torch.load(fname_model, map_location="cpu")

# TODO: determine based on model data
# TODO: add decoder / prompt encoder if needed
hparams = {
    "n_enc_state": 768,
    "n_enc_layers": 12,
    "n_enc_heads": 12,
    "n_enc_out_chans": 256,

    "n_pt_embd": 4,
}

print(hparams)

for k, v in model.items():
    print(k, v.shape)

with open(fname_out, "wb") as fout:
    # file header: magic, hparams, default ftype
    fout.write(struct.pack("i", 0x67676d6c))  # magic: ggml in hex
    fout.write(struct.pack("i", hparams["n_enc_state"]))
    fout.write(struct.pack("i", hparams["n_enc_layers"]))
    fout.write(struct.pack("i", hparams["n_enc_heads"]))
    fout.write(struct.pack("i", hparams["n_enc_out_chans"]))
    fout.write(struct.pack("i", hparams["n_pt_embd"]))
    fout.write(struct.pack("i", ftype))

    for name, tensor in model.items():
        # TODO: export only the Encoder -- after it works we will export the other stuff
        if not (name.startswith("image_encoder") or name.startswith("prompt_encoder")):
            continue

        # the mask embeddings of the prompt encoder are not needed yet
        if name.startswith("prompt_encoder.mask"):
            continue

        print("Processing variable: " + name + " with shape: ", tensor.shape, " and type: ", tensor.dtype)

        data = tensor.numpy()
        n_dims = len(data.shape)
        dshape = data.shape

        # default type is fp16; keep 1D tensors and position/point
        # embeddings in fp32 for precision
        ftype_cur = 1
        if ftype == 0 or n_dims == 1 or \
           name == "image_encoder.pos_embed" or \
           name.startswith("prompt_encoder.point_embeddings"):
            print(" Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
        else:
            print(" Converting to float16")
            data = data.astype(np.float16)

        # reshape the 1D bias into a 4D tensor so we can use ggml_repeat
        # keep it in F32 since the data is small
        if name == "image_encoder.patch_embed.proj.bias":
            data = data.reshape(1, data.shape[0], 1, 1)
            n_dims = len(data.shape)
            dshape = data.shape

        print(" New shape: ", dshape)

        # per-tensor header: n_dims, name length, ftype, then dims in
        # reverse (ggml) order, then the raw name bytes
        # (renamed from `str` -- the original rebound the builtin)
        name_bytes = name.encode('utf-8')
        fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype_cur))
        for i in range(n_dims):
            fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
        fout.write(name_bytes)

        # data
        data.tofile(fout)

print("Done. Output file: " + fname_out)
print("")
Loading

0 comments on commit dffd07e

Please sign in to comment.