server: multimodal - fix misreported prompt and num prompt tokens #5896

Closed · wants to merge 3 commits
92 changes: 91 additions & 1 deletion examples/server/server.cpp
@@ -1487,11 +1487,101 @@ struct server_context {
}
}

// for multiple images processing
bool ingest_images(server_slot &slot, int n_batch)
{
int image_idx = 0;
std::string prompt = "";

while (image_idx < (int) slot.images.size())
{
slot_image &img = slot.images[image_idx];

// process prefix prompt
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
{
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,
batch.pos + i,
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
if (llama_decode(ctx, batch_view))
{
LOG_TEE("%s : failed to eval\n", __func__);
return false;
}
}

// process image with llm
for (int i = 0; i < img.image_tokens; i += n_batch)
{
int n_eval = img.image_tokens - i;
if (n_eval > n_batch)
{
n_eval = n_batch;
}

const int n_embd = llama_n_embd(model);
llama_batch batch_img = {
n_eval,
nullptr,
(img.image_embedding + i * n_embd),
nullptr,
nullptr,
nullptr,
nullptr,
slot.n_past,
1, 0
};
if (llama_decode(ctx, batch_img))
{
LOG_TEE("%s : failed to eval image\n", __func__);
return false;
}
slot.n_past += n_eval;
}
image_idx++;

llama_batch_clear(batch);

// append prefix of next image
const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
slot.params.input_suffix : // no more images, then process suffix prompt
(json)(slot.images[image_idx].prefix_prompt);

// rebuild the prompt since it was cleared earlier
prompt += img.prefix_prompt;
prompt += "[img-" + std::to_string(img.id) + "]";
prompt += json_prompt;

std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // prefix of the next image, or the suffix if no images remain
for (int i = 0; i < (int) append_tokens.size(); ++i)
{
llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
slot.n_past += 1;
}
}

// There is no prompt caching in multimodal currently
slot.n_prompt_tokens = slot.n_past;
slot.n_prompt_tokens_processed = slot.n_past;

// report the reconstructed multimodal prompt instead of leaving it empty
slot.prompt = prompt;

return true;
}

void request_cancel(int id_task) {
server_task task;
task.type = SERVER_TASK_TYPE_CANCEL;
task.id_target = id_task;

queue_tasks.post(task);
}

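
Note (not part of the diff): a minimal, standalone sketch of the prompt string that the loop above reassembles and stores in slot.prompt. The image ids and prompt fragments below are invented for illustration; in the server, the per-image prefix prompts and the suffix come from splitting the request prompt on its "[img-<id>]" markers.

// sketch: rebuild the reported prompt from per-image prefixes, "[img-<id>]"
// markers, and the trailing suffix (ids and text are made up for the example)
#include <cstdio>
#include <string>
#include <vector>

struct image_part {
    int         id;            // id referenced as [img-<id>] in the prompt
    std::string prefix_prompt; // text that precedes this image
};

int main() {
    const std::vector<image_part> images = {
        { 10, "USER: describe " },
        { 11, " and compare it with " },
    };
    const std::string suffix = "\nASSISTANT:";

    std::string prompt;
    for (const auto & img : images) {
        prompt += img.prefix_prompt;
        prompt += "[img-" + std::to_string(img.id) + "]";
    }
    prompt += suffix;

    // with this patch, slot.prompt carries a string like this instead of ""
    printf("%s\n", prompt.c_str());
    return 0;
}

Separately, slot.n_prompt_tokens and slot.n_prompt_tokens_processed are both set to slot.n_past, since every prompt token (text and image embeddings) is evaluated on this path and no prompt caching applies to multimodal requests.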