Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ Checks: >
-readability-uppercase-literal-suffix,
-readability-simplify-boolean-expr,
-readability-math-missing-parentheses,
-readability-braces-around-statements,
-readability-isolate-declaration,
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
Expand Down
7 changes: 7 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2768,6 +2768,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.image.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_MTMD}));
// Register the --video option (takes a PATH value).
// Each occurrence appends its value to params.video, so the flag can be repeated.
// The option is attached to the LLAMA_EXAMPLE_MTMD example set.
add_opt(common_arg(
{"--video"}, "PATH",
"path to a video file (requires FFmpeg at build time) or a directory of frames; can be repeated.\n",
[](common_params & params, const std::string & value) {
// keep every path given on the command line, in order
params.video.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_MTMD}));
add_opt(common_arg(
{"--image-min-tokens"}, "N",
"minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,7 @@ struct common_params {
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)
std::vector<std::string> video; // path to video file(s) or frame directories
int image_min_tokens = -1;
int image_max_tokens = -1;

Expand Down
20 changes: 20 additions & 0 deletions tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ find_package(Threads REQUIRED)
# mtmd library target: sources and headers, including the
# mtmd-video.cpp / mtmd-video.h pair.
add_library(mtmd
mtmd.cpp
mtmd-audio.cpp
mtmd-video.cpp
mtmd.h
clip.cpp
clip.h
clip-impl.h
mtmd-helper.cpp
mtmd-helper.h
mtmd-video.h
)

target_link_libraries (mtmd PUBLIC ggml llama)
Expand All @@ -20,6 +22,23 @@ target_include_directories(mtmd PRIVATE ../..)
target_include_directories(mtmd PRIVATE ../../vendor)
target_compile_features (mtmd PRIVATE cxx_std_17)

# Optional FFmpeg support for video decoding
# The option defaults to OFF. When it is ON, FFmpeg is looked up through
# pkg-config; if pkg-config or the FFmpeg modules cannot be found, a warning is
# printed and mtmd is built without the MTMD_WITH_FFMPEG compile definition.
option(MTMD_WITH_FFMPEG "Enable FFmpeg-based video decoding in mtmd-video" OFF)
if (MTMD_WITH_FFMPEG)
find_package(PkgConfig QUIET)
if (PKG_CONFIG_FOUND)
# request the FFmpeg components as one imported target (PkgConfig::FFMPEG)
pkg_check_modules(FFMPEG QUIET IMPORTED_TARGET libavformat libavcodec libswscale libavutil)
if (FFMPEG_FOUND)
# link privately and expose MTMD_WITH_FFMPEG to the mtmd sources only
target_link_libraries(mtmd PRIVATE PkgConfig::FFMPEG)
target_compile_definitions(mtmd PRIVATE MTMD_WITH_FFMPEG)
else()
message(WARNING "FFmpeg not found via pkg-config; MTMD_WITH_FFMPEG disabled")
endif()
else()
message(WARNING "pkg-config not found; MTMD_WITH_FFMPEG disabled")
endif()
endif()

if (BUILD_SHARED_LIBS)
set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
Expand All @@ -29,6 +48,7 @@ endif()
# Header list stored in MTMD_PUBLIC_HEADERS; mtmd-video.h is listed next to
# mtmd.h and mtmd-helper.h.
# NOTE(review): presumably consumed by the set_target_properties(mtmd ...) call
# that follows - confirm against the full file.
set(MTMD_PUBLIC_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
${CMAKE_CURRENT_SOURCE_DIR}/mtmd-video.h
)

set_target_properties(mtmd
Expand Down
110 changes: 58 additions & 52 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ struct clip_hparams {
// legacy
bool has_llava_projector = false;
int minicpmv_version = 0;
int minicpmv_max_slice_nums = 9;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Search for max_slice_nums, it's already present in the code base.

int32_t minicpmv_query_num = 0; // MiniCPM-V query number

// custom value provided by user, can be undefined if not set
Expand Down Expand Up @@ -3911,16 +3912,67 @@ struct llava_uhd {
const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();

if (!has_slices) {
// skip slicing logic
res.overview_size = clip_image_size{slice_size, slice_size};
res.refined_size = clip_image_size{0, 0};
res.grid_size = clip_image_size{0, 0};
if (clip_is_minicpmv(ctx)) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the change here compared to the initial version?

IMO this looks like duplicated logic; see clip_image_preprocess for how it is used by minicpm-v.

auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
res.overview_size = best_size;

{
const int max_slice_nums = ctx->model.hparams.minicpmv_max_slice_nums;
const float log_ratio = log((float)original_width / original_height);
const float ratio = (float)original_width * original_height / (slice_size * slice_size);
const int multiple = fmin(ceil(ratio), max_slice_nums);

auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
res.grid_size = best_grid;
res.refined_size = refine_size;

LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
__func__, original_width, original_height,
res.overview_size.width, res.overview_size.height,
res.refined_size.width, res.refined_size.height,
res.grid_size.width, res.grid_size.height);

if (!has_slices || max_slice_nums == 0) {
return res;
}

int width = refine_size.width;
int height = refine_size.height;
int grid_x = int(width / best_grid.width);
int grid_y = int(height / best_grid.height);
for (int patches_y = 0, ic = 0;
patches_y < refine_size.height && ic < best_grid.height;
patches_y += grid_y, ic += 1) {
for (int patches_x = 0, jc = 0;
patches_x < refine_size.width && jc < best_grid.width;
patches_x += grid_x, jc += 1) {
slice_coordinates slice;
slice.x = patches_x;
slice.y = patches_y;
slice.size.width = grid_x;
slice.size.height = grid_y;
res.slices.push_back(slice);
LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
__func__, (int)res.slices.size() - 1,
slice.x, slice.y, slice.size.width, slice.size.height);
}
}
}

return res;
}
else {
if (!has_slices) {
// skip slicing logic
res.overview_size = clip_image_size{slice_size, slice_size};
res.refined_size = clip_image_size{0, 0};
res.grid_size = clip_image_size{0, 0};

return res;
}

if (has_pinpoints) {
if (has_pinpoints) {
// has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
auto refine_size = llava_uhd::select_best_resolution(
original_size,
Expand Down Expand Up @@ -3956,53 +4008,7 @@ struct llava_uhd {

return res;
}

// no pinpoints, dynamically calculate the grid size (e.g. minicpmv)

auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
res.overview_size = best_size;

{
const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
const float log_ratio = log((float)original_width / original_height);
const float ratio = (float)original_width * original_height / (slice_size * slice_size);
const int multiple = fmin(ceil(ratio), max_slice_nums);

auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
res.grid_size = best_grid;
res.refined_size = refine_size;

LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
__func__, original_width, original_height,
res.overview_size.width, res.overview_size.height,
res.refined_size.width, res.refined_size.height,
res.grid_size.width, res.grid_size.height);

int width = refine_size.width;
int height = refine_size.height;
int grid_x = int(width / best_grid.width);
int grid_y = int(height / best_grid.height);
for (int patches_y = 0, ic = 0;
patches_y < refine_size.height && ic < best_grid.height;
patches_y += grid_y, ic += 1) {
for (int patches_x = 0, jc = 0;
patches_x < refine_size.width && jc < best_grid.width;
patches_x += grid_x, jc += 1) {
slice_coordinates slice;
slice.x = patches_x;
slice.y = patches_y;
slice.size.width = grid_x;
slice.size.height = grid_y;
res.slices.push_back(slice);
LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
__func__, (int)res.slices.size() - 1,
slice.x, slice.y, slice.size.width, slice.size.height);
}
}
}

return res;
}

static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
Expand Down
48 changes: 35 additions & 13 deletions tools/mtmd/mtmd-cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@
#include "ggml.h"
#include "console.h"
#include "chat.h"
#include "clip.h"
#include "mtmd.h"
#include "mtmd-helper.h"
#include "mtmd-video.h"

#include <vector>
#include <limits.h>
#include <cinttypes>
#include <cstdlib>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
Expand Down Expand Up @@ -157,8 +160,8 @@ struct mtmd_cli_context {
);
}

bool load_media(const std::string & fname) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
bool load_media(const std::string & path) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), path.c_str()));
if (!bmp.ptr) {
return false;
}
Expand Down Expand Up @@ -287,7 +290,7 @@ int main(int argc, char ** argv) {
mtmd_cli_context ctx(params);
LOG("%s: loading model: %s\n", __func__, params.model.path.c_str());

bool is_single_turn = !params.prompt.empty() && !params.image.empty();
bool is_single_turn = !params.prompt.empty() && (!params.image.empty() || !params.video.empty());

int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;

Expand All @@ -311,19 +314,34 @@ int main(int argc, char ** argv) {

if (is_single_turn) {
g_is_generating = true;
if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
for (size_t i = 0; i < params.image.size(); i++) {
params.prompt += mtmd_default_marker();
}
}
common_chat_msg msg;
msg.role = "user";
msg.content = params.prompt;

// 1) load all media first
size_t n_loaded_media = 0;
for (const auto & image : params.image) {
if (!ctx.load_media(image)) {
return 1; // error is already printed by libmtmd
}
n_loaded_media += 1;
}
for (const auto & vpath : params.video) {
if (!ctx.load_media(vpath)) {
return 1; // error is already printed by libmtmd
}
n_loaded_media += 1;
}

// 2) build prompt content with correct number of markers
std::string prompt_content = params.prompt;
if (prompt_content.find(mtmd_default_marker()) == std::string::npos) {
for (size_t i = 0; i < n_loaded_media; i++) {
prompt_content += mtmd_default_marker();
}
}

// 3) run
common_chat_msg msg;
msg.role = "user";
msg.content = prompt_content;
if (eval_message(ctx, msg)) {
return 1;
}
Expand All @@ -339,6 +357,9 @@ int main(int argc, char ** argv) {
if (mtmd_support_audio(ctx.ctx_vision.get())) {
LOG("\n /audio <path> load an audio");
}
if (mtmd_support_vision(ctx.ctx_vision.get())) {
LOG("\n /video <path> load a video");
}
LOG("\n /clear clear the chat history");
LOG("\n /quit or /exit exit the program");
LOG("\n");
Expand Down Expand Up @@ -370,14 +391,15 @@ int main(int argc, char ** argv) {
g_is_generating = true;
bool is_image = line == "/image" || line.find("/image ") == 0;
bool is_audio = line == "/audio" || line.find("/audio ") == 0;
if (is_image || is_audio) {
bool is_video = line == "/video" || line.find("/video ") == 0;
if (is_image || is_audio || is_video) {
if (line.size() < 8) {
LOG_ERR("ERR: Missing media filename\n");
continue;
}
std::string media_path = line.substr(7);
if (ctx.load_media(media_path)) {
LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : (is_audio ? "audio" : "video"));
content += mtmd_default_marker();
}
// else, error is already printed by libmtmd
Expand Down
35 changes: 27 additions & 8 deletions tools/mtmd/mtmd-helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,18 @@
# define NOMINMAX
#endif
#include <windows.h>
#else
#include <dirent.h>
#include <sys/stat.h>
#include <sys/types.h>
#endif

#include "mtmd.h"
#include "mtmd-helper.h"
#include "llama.h"

#include "mtmd-video.h"

#include <algorithm>
#include <cinttypes>
#include <vector>
Expand Down Expand Up @@ -421,6 +427,10 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
}

if(mtmd_video::is_video_buffer(buf, len)) {
return mtmd_video::init_video_bitmap(ctx, buf, len);
}

// otherwise, we assume it's an image
mtmd_bitmap * result = nullptr;
{
Expand All @@ -436,25 +446,34 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
return result;
}

// Create an mtmd_bitmap from a path on disk.
//
// ctx  : mtmd context, forwarded to the video / buffer loaders.
// path : file to load, or a directory of frame images.
//
// Returns the bitmap produced by mtmd_video::init_video_bitmap (video input) or
// by mtmd_helper_bitmap_init_from_buf (everything else). Returns nullptr when
// the file cannot be opened, its size cannot be determined, or it cannot be
// read completely.
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * path) {
    // Attention! A directory containing frame images is also considered a video,
    // so a path that is a directory is handled by mtmd_video::init_video_bitmap.
    // We could read a video file into memory and go through
    // mtmd_helper_bitmap_init_from_buf, but it is better to let FFmpeg read
    // video files directly from disk.
    if (mtmd_video::is_video_file(path)) {
        return mtmd_video::init_video_bitmap(ctx, path);
    }

    FILE * f = fopen(path, "rb");
    if (!f) {
        LOG_ERR("Unable to open path %s: %s\n", path, strerror(errno));
        return nullptr;
    }

    fseek(f, 0, SEEK_END);
    const long file_size = ftell(f);
    if (file_size < 0) {
        // ftell reports failure with -1; never use that as an allocation size
        LOG_ERR("Unable to determine size of path %s: %s\n", path, strerror(errno));
        fclose(f);
        return nullptr;
    }
    fseek(f, 0, SEEK_SET);

    // std::vector owns the buffer, so it is released on every return path
    // (a raw new[] buffer would leak on the short-read return below)
    std::vector<unsigned char> buf(static_cast<size_t>(file_size));

    const size_t n_read = fread(buf.data(), 1, buf.size(), f);
    fclose(f);
    if (n_read != buf.size()) {
        LOG_ERR("Failed to read entire path %s", path);
        return nullptr;
    }

    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
}
Loading