monatis · mdhvg · Mar 11, 2025
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,6 @@ models/*.bin
 __pycache__
 dist
 *.gguf
+
+.vs
+**/*.env
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,6 +1,10 @@
 cmake_minimum_required(VERSION 3.12)
 project("CLIP.cpp" C CXX)
 
+# Build every C++ target as C++20 and fail at configure time if the compiler cannot provide it,
+# instead of silently falling back to an older standard.
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
@@ -22,7 +24,7 @@ endif()
 
 # general
 option(CLIP_STATIC                 "CLIP: static link libraries"                          OFF)
-option(CLIP_BUILD_TESTS             "CLIP: build tests"                                    ${CLIP_STANDALONE})
+option(CLIP_BUILD_TESTS            "CLIP: build tests"                                    ${CLIP_STANDALONE})
 option(CLIP_BUILD_EXAMPLES         "CLIP: build examples"                                 ${CLIP_STANDALONE})
 option(CLIP_BUILD_IMAGE_SEARCH     "CLIP: build image-search"                             OFF)
 option(CLIP_NATIVE                 "CLIP: enable -march=native flag"                      ON)
@@ -42,12 +44,12 @@ option(CLIP_SANITIZE_UNDEFINED     "CLIP: enable undefined sanitizer"
 option(CLIP_AVX                    "CLIP: enable AVX"                                     ON)
 option(CLIP_AVX2                   "CLIP: enable AVX2"                                    ON)
 option(CLIP_FMA                    "CLIP: enable FMA"                                     ON)
-option(CLIP_AVX512                 "clip: enable AVX512"                                  OFF)
-option(CLIP_AVX512_VBMI            "clip: enable AVX512-VBMI"                             OFF)
-option(CLIP_AVX512_VNNI            "clip: enable AVX512-VNNI"                             OFF)
+option(CLIP_AVX512                 "CLIP: enable AVX512"                                  OFF)
+option(CLIP_AVX512_VBMI            "CLIP: enable AVX512-VBMI"                             OFF)
+option(CLIP_AVX512_VNNI            "CLIP: enable AVX512-VNNI"                             OFF)
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
-    option(CLIP_F16C               "clip: enable F16C"                                    ON)
+    option(CLIP_F16C               "CLIP: enable F16C"                                    ON)
 endif()
 
 

diff --git a/clip.cpp b/clip.cpp
@@ -5,15 +5,85 @@
 #include <fstream>
 #include <iostream>
 #include <map>
-#include <pthread.h>
 #include <regex>
 #include <stdexcept>
 #include <thread>
 #include <vector>
+#include <algorithm>
+#include <cerrno>
 
 #include "clip.h"
 #include "ggml/ggml.h"
 
+// Threading shim: on Windows the pthread-style calls used in this file are implemented on
+// top of Win32 primitives; on other platforms the real <pthread.h> is used.
+#if defined(_WIN32)
+
+// Guarded so a build that already defines NOMINMAX does not get a macro-redefinition warning.
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+
+typedef volatile LONG atomic_int;
+typedef atomic_int atomic_bool;
+
+// Interlocked*-based stand-ins for the C11 atomic_* functions of the same names.
+static void atomic_store(atomic_int * ptr, LONG val) { InterlockedExchange(ptr, val); }
+static LONG atomic_load(atomic_int * ptr) { return InterlockedCompareExchange(ptr, 0, 0); }
+static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { return InterlockedExchangeAdd(ptr, inc); }
+static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); }
+
+typedef HANDLE pthread_t;
+
+typedef DWORD thread_ret_t;
+
+// Starts func(arg) on a new thread and stores the thread handle in *out.
+// Returns 0 on success or EAGAIN when CreateThread fails; `unused` (thread attributes) is ignored.
+// NOTE(review): func is cast to LPTHREAD_START_ROUTINE, which is only sound where the default
+// calling convention matches WINAPI (e.g. x64) - confirm if 32-bit x86 builds matter.
+static int pthread_create(pthread_t * out, void * unused, thread_ret_t (*func)(void *), void * arg) {
+    (void)unused;
+    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL);
+    if (handle == NULL) {
+        return EAGAIN;
+    }
+
+    *out = handle;
+    return 0;
+}
+
+// Blocks until `thread` has finished, then closes its handle so it is not leaked.
+// Returns the WaitForSingleObject result (0, i.e. WAIT_OBJECT_0, on success); `unused` is ignored.
+static int pthread_join(pthread_t thread, void * unused) {
+    (void)unused;
+    const int ret = (int)WaitForSingleObject(thread, INFINITE);
+    CloseHandle(thread);
+    return ret;
+}
+
+// Gives up the remainder of the calling thread's time slice. Always returns 0.
+static int sched_yield(void) {
+    Sleep(0);
+    return 0;
+}
+
+// A Win32 thread ends by returning from its start routine. No trailing semicolon here, so
+// `pthread_exit(x);` expands to exactly one statement.
+#define pthread_exit(stat) return stat
+#else
+// <stdatomic.h> is intentionally not included: it only became a C++ header in C++23 and
+// nothing on this side of the shim uses the C11 atomics.
+#include <pthread.h>
+
+typedef void * thread_ret_t;
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#endif
+
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
 
@@ -810,7 +860,7 @@ typedef struct {
 } ImageDataRange;
 
 // Function to preprocess a single image in a thread
-void * preprocess_image(void * arg) {
+thread_ret_t preprocess_image(void * arg) {
     ImageDataRange * imageDataRange = static_cast<ImageDataRange *>(arg);
 
     ImageData * imageData_start = imageDataRange->start;
@@ -1407,8 +1457,12 @@ bool clip_compare_text_and_image(const clip_ctx * ctx, const int n_threads, cons
 
     // prepare image and text vectors
     const int projection_dim = ctx->vision_model.hparams.projection_dim;
-    float img_vec[projection_dim];
-    float txt_vec[projection_dim];
+    // std::vector owns the storage, so it is released on every return path without manual
+    // delete[]; the raw pointers keep the names the rest of the function refers to.
+    std::vector<float> img_vec_storage(projection_dim);
+    std::vector<float> txt_vec_storage(projection_dim);
+    float * img_vec = img_vec_storage.data();
+    float * txt_vec = txt_vec_storage.data();
 
     // tokenize and encode text
     clip_tokens tokens;
@@ -1502,14 +1554,20 @@ bool clip_zero_shot_label_image(struct clip_ctx * ctx, const int n_threads, cons
 
     clip_image_preprocess(ctx, input_img, &img_res);
 
-    float img_vec[vec_dim];
+    // The scratch buffers are owned by std::vector so they are released on every return
+    // path (including the early `return false` below); the raw pointers keep the names
+    // the rest of the function refers to.
+    std::vector<float> img_vec_storage(vec_dim);
+    float * img_vec = img_vec_storage.data();
     if (!clip_image_encode(ctx, n_threads, &img_res, img_vec, false)) {
         return false;
     }
 
     // encode texts and compute similarities
-    float txt_vec[vec_dim];
-    float similarities[n_labels];
+    std::vector<float> txt_vec_storage(vec_dim);
+    std::vector<float> similarities_storage(n_labels);
+    float * txt_vec = txt_vec_storage.data();
+    float * similarities = similarities_storage.data();
 
     for (int i = 0; i < n_labels; i++) {
         const auto & text = labels[i];

diff --git a/clip.h b/clip.h
@@ -7,6 +7,20 @@
 
 struct clip_ctx;
 
+// On Windows the pthread-style type names used by this project are mapped onto Win32 types.
+#if defined(_WIN32)
+
+// Guarded so a build that already defines NOMINMAX does not get a macro-redefinition warning.
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+
+typedef HANDLE pthread_t;
+typedef DWORD thread_ret_t;
+
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif

diff --git a/examples/extract.cpp b/examples/extract.cpp
@@ -28,6 +28,7 @@ int main(int argc, char ** argv) {
     int totalInputs = params.image_paths.size() + params.texts.size();
     int processedInputs = 0;
     int textCounter = 0; // Counter for generating unique filenames for text vectors
+    float * vec;
     for (const std::string & img_path : params.image_paths) {
         // load the image
         const char * img_path_cstr = img_path.c_str();
@@ -45,7 +46,7 @@ int main(int argc, char ** argv) {
 
         const int vec_dim = clip_get_vision_hparams(ctx)->projection_dim;
         int shape[2] = {1, vec_dim};
-        float vec[vec_dim];
+        vec = new float[vec_dim];
         clip_image_encode(ctx, params.n_threads, &img_res, vec, false);
 
         // Generate a unique output filename for each image
@@ -57,6 +58,7 @@ int main(int argc, char ** argv) {
         float progressPercentage = (float)processedInputs / totalInputs * 100.0f;
         printf("\rProcessing: %.2f%%", progressPercentage);
         fflush(stdout);
+        delete[] vec;
     }
 
     for (const std::string & text : params.texts) {
@@ -69,7 +71,7 @@ int main(int argc, char ** argv) {
 
         const int vec_dim = clip_get_text_hparams(ctx)->projection_dim;
         int shape[2] = {1, vec_dim};
-        float vec[vec_dim];
+        vec = new float[vec_dim];
 
         if (!clip_text_encode(ctx, params.n_threads, &tokens, vec, false)) {
             printf("Unable to encode text\n");
@@ -85,6 +87,7 @@ int main(int argc, char ** argv) {
         // Generate a unique output filename for each text
         std::string output_filename = "./text_vec_" + std::to_string(textCounter++) + ".npy";
         writeNpyFile(output_filename.c_str(), vec, shape, 2);
+        delete[] vec;
     }
 
     printf("\n"); // Print a newline to clear the progress bar line

diff --git a/examples/simple.c b/examples/simple.c
@@ -35,7 +35,7 @@ int main() {
     }
 
     // Encode image
-    float img_vec[vec_dim];
+    float *img_vec = (float*)malloc(vec_dim * sizeof(float));
     if (!clip_image_encode(ctx, n_threads, img_res, img_vec, true)) {
         fprintf(stderr, "%s: failed to encode image\n", __func__);
         return 1;
@@ -46,7 +46,7 @@ int main() {
     clip_tokenize(ctx, text, tokens);
 
     // Encode text
-    float txt_vec[vec_dim];
+    float *txt_vec= (float *)malloc(vec_dim * sizeof(float));
     if (!clip_text_encode(ctx, n_threads, tokens, txt_vec, true)) {
         fprintf(stderr, "%s: failed to encode text\n", __func__);
         return 1;
@@ -66,6 +66,8 @@ int main() {
 
     // Cleanup
     clip_free(ctx);
+    free(img_vec);
+    free(txt_vec);
 
     return 0;
 }
diff --git a/examples/zsl.cpp b/examples/zsl.cpp
@@ -15,7 +15,7 @@ int main(int argc, char ** argv) {
         printf("%s: You must specify at least 2 texts for zero-shot labeling\n", __func__);
     }
 
-    const char * labels[n_labels];
+    const char ** labels = new const char *[n_labels];
     for (size_t i = 0; i < n_labels; ++i) {
         labels[i] = params.texts[i].c_str();
     }
@@ -34,8 +34,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    float sorted_scores[n_labels];
-    int sorted_indices[n_labels];
+    float *sorted_scores = new float[n_labels];
+    int *sorted_indices = new int[n_labels];
     if (!clip_zero_shot_label_image(ctx, params.n_threads, &input_img, labels, n_labels, sorted_scores, sorted_indices)) {
         fprintf(stderr, "Unable to apply ZSL\n");
         return 1;
@@ -48,6 +48,9 @@ int main(int argc, char ** argv) {
     }
 
     clip_free(ctx);
+    delete[] labels;
+    delete[] sorted_scores;
+    delete[] sorted_indices;
 
     return 0;
 }
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
@@ -52,7 +52,7 @@ int main(int argc, char ** argv) {
 
     const int vec_dim = clip_get_text_hparams(ctx)->projection_dim;
 
-    float txt_vecs[n_labels * vec_dim];
+    float *txt_vecs = new float[n_labels * vec_dim];
 
     ggml_time_init();
 
@@ -79,11 +79,11 @@ int main(int argc, char ** argv) {
     int n_total_items = 0;         // total number of images processed
     float total_acc1_score = 0.0f; // total accuracy at 1 for the intire dataset
     float total_acc5_score = 0.0f; // total accuracy at 5 in intitre dataset
-    float img_vecs[vec_dim * batch_size];
+    float *img_vecs = new float[vec_dim * batch_size];
 
-    float similarities[n_labels];
-    float sorted_scores[n_labels];
-    int indices[n_labels];
+    float *similarities = new float[n_labels];
+    float *sorted_scores = new float[n_labels];
+    int *indices = new int[n_labels];
     std::vector<clip_image_u8> img_inputs(batch_size);
     std::vector<clip_image_f32> imgs_resized(batch_size);
 
@@ -167,6 +167,11 @@ int main(int argc, char ** argv) {
     }
 
     clip_free(ctx);
+    delete[] txt_vecs;
+    delete[] img_vecs;
+    delete[] similarities;
+    delete[] sorted_scores;
+    delete[] indices;
 
     return 0;
-}
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,3 +7,6 @@ models/*.bin @@
     __pycache__
     dist
     *.gguf
+    .vs
+    **/*.env