22#define __CONDITIONER_HPP__
33
44#include " clip.hpp"
5+ #include " qwenvl.hpp"
56#include " t5.hpp"
67
78struct SDCondition {
@@ -22,11 +23,11 @@ struct Conditioner {
2223 int width,
2324 int height,
2425 int adm_in_channels = -1 ,
25- bool zero_out_masked = false ) = 0;
26- virtual void alloc_params_buffer () = 0;
27- virtual void free_params_buffer () = 0;
28- virtual void get_param_tensors (std::map<std::string, struct ggml_tensor *>& tensors) = 0;
29- virtual size_t get_params_buffer_size () = 0;
26+ bool zero_out_masked = false ) = 0;
27+ virtual void alloc_params_buffer () = 0;
28+ virtual void free_params_buffer () = 0;
29+ virtual void get_param_tensors (std::map<std::string, struct ggml_tensor *>& tensors) = 0;
30+ virtual size_t get_params_buffer_size () = 0;
3031 virtual std::tuple<SDCondition, std::vector<bool >> get_learned_condition_with_trigger (ggml_context* work_ctx,
3132 int n_threads,
3233 const std::string& text,
@@ -35,9 +36,13 @@ struct Conditioner {
3536 int height,
3637 int num_input_imgs,
3738 int adm_in_channels = -1 ,
38- bool zero_out_masked = false ) = 0;
39+ bool zero_out_masked = false ) {
40+ GGML_ABORT (" Not implemented yet!" );
41+ }
3942 virtual std::string remove_trigger_from_prompt (ggml_context* work_ctx,
40- const std::string& prompt) = 0;
43+ const std::string& prompt) {
44+ GGML_ABORT (" Not implemented yet!" );
45+ }
4146};
4247
4348// ldm.modules.encoders.modules.FrozenCLIPEmbedder
@@ -978,23 +983,6 @@ struct SD3CLIPEmbedder : public Conditioner {
978983 auto tokens_and_weights = tokenize (text, 77 , true );
979984 return get_learned_condition_common (work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked);
980985 }
981-
982- std::tuple<SDCondition, std::vector<bool >> get_learned_condition_with_trigger (ggml_context* work_ctx,
983- int n_threads,
984- const std::string& text,
985- int clip_skip,
986- int width,
987- int height,
988- int num_input_imgs,
989- int adm_in_channels = -1 ,
990- bool zero_out_masked = false ) {
991- GGML_ASSERT (0 && " Not implemented yet!" );
992- }
993-
994- std::string remove_trigger_from_prompt (ggml_context* work_ctx,
995- const std::string& prompt) {
996- GGML_ASSERT (0 && " Not implemented yet!" );
997- }
998986};
999987
1000988struct FluxCLIPEmbedder : public Conditioner {
@@ -1195,23 +1183,6 @@ struct FluxCLIPEmbedder : public Conditioner {
11951183 auto tokens_and_weights = tokenize (text, chunk_len, true );
11961184 return get_learned_condition_common (work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked);
11971185 }
1198-
1199- std::tuple<SDCondition, std::vector<bool >> get_learned_condition_with_trigger (ggml_context* work_ctx,
1200- int n_threads,
1201- const std::string& text,
1202- int clip_skip,
1203- int width,
1204- int height,
1205- int num_input_imgs,
1206- int adm_in_channels = -1 ,
1207- bool zero_out_masked = false ) {
1208- GGML_ASSERT (0 && " Not implemented yet!" );
1209- }
1210-
1211- std::string remove_trigger_from_prompt (ggml_context* work_ctx,
1212- const std::string& prompt) {
1213- GGML_ASSERT (0 && " Not implemented yet!" );
1214- }
12151186};
12161187
12171188struct T5CLIPEmbedder : public Conditioner {
@@ -1398,22 +1369,135 @@ struct T5CLIPEmbedder : public Conditioner {
13981369 auto tokens_and_weights = tokenize (text, chunk_len, true );
13991370 return get_learned_condition_common (work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked);
14001371 }
1372+ };
14011373
1402- std::tuple<SDCondition, std::vector< bool >> get_learned_condition_with_trigger (ggml_context* work_ctx,
1403- int n_threads,
1404- const std::string& text,
1405- int clip_skip,
1406- int width,
1407- int height ,
1408- int num_input_imgs ,
1409- int adm_in_channels = - 1 ,
1410- bool zero_out_masked = false ) {
1411- GGML_ASSERT ( 0 && " Not implemented yet! " );
1374+ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
1375+ Qwen::Qwen2Tokenizer tokenizer;
1376+ std::shared_ptr<Qwen::Qwen2_5_VLRunner> qwenvl;
1377+ int prompt_template_encode_start_idx = 34 ;
1378+
1379+ Qwen2_5_VLCLIPEmbedder ( ggml_backend_t backend ,
1380+ bool offload_params_to_cpu ,
1381+ const String2GGMLType& tensor_types = {} ,
1382+ const std::string prefix = " " ) {
1383+ qwenvl = std::make_shared<Qwen::Qwen2_5_VLRunner>(backend, offload_params_to_cpu, tensor_types, " text_encoders.qwen2vl " );
14121384 }
14131385
1414- std::string remove_trigger_from_prompt (ggml_context* work_ctx,
1415- const std::string& prompt) {
1416- GGML_ASSERT (0 && " Not implemented yet!" );
1386+ void get_param_tensors (std::map<std::string, struct ggml_tensor *>& tensors) {
1387+ qwenvl->get_param_tensors (tensors, " text_encoders.qwen2vl" );
1388+ }
1389+
1390+ void alloc_params_buffer () {
1391+ qwenvl->alloc_params_buffer ();
1392+ }
1393+
1394+ void free_params_buffer () {
1395+ qwenvl->free_params_buffer ();
1396+ }
1397+
1398+ size_t get_params_buffer_size () {
1399+ size_t buffer_size = 0 ;
1400+ buffer_size += qwenvl->get_params_buffer_size ();
1401+ return buffer_size;
1402+ }
1403+
1404+ std::tuple<std::vector<int >, std::vector<float >> tokenize (std::string text,
1405+ size_t max_length = 0 ,
1406+ bool padding = false ) {
1407+ auto parsed_attention = parse_prompt_attention (text);
1408+
1409+ {
1410+ std::stringstream ss;
1411+ ss << " [" ;
1412+ for (const auto & item : parsed_attention) {
1413+ ss << " ['" << item.first << " ', " << item.second << " ], " ;
1414+ }
1415+ ss << " ]" ;
1416+ LOG_DEBUG (" parse '%s' to %s" , text.c_str (), ss.str ().c_str ());
1417+ }
1418+
1419+ std::vector<int > tokens;
1420+ std::vector<float > weights;
1421+ for (const auto & item : parsed_attention) {
1422+ const std::string& curr_text = item.first ;
1423+ float curr_weight = item.second ;
1424+ std::vector<int > curr_tokens = tokenizer.tokenize (curr_text, nullptr );
1425+ tokens.insert (tokens.end (), curr_tokens.begin (), curr_tokens.end ());
1426+ weights.insert (weights.end (), curr_tokens.size (), curr_weight);
1427+ }
1428+
1429+ tokenizer.pad_tokens (tokens, weights, max_length, padding);
1430+
1431+ // for (int i = 0; i < tokens.size(); i++) {
1432+ // std::cout << tokens[i] << ":" << weights[i] << ", ";
1433+ // }
1434+ // std::cout << std::endl;
1435+
1436+ return {tokens, weights};
1437+ }
1438+
1439+ SDCondition get_learned_condition_common (ggml_context* work_ctx,
1440+ int n_threads,
1441+ std::tuple<std::vector<int >, std::vector<float >> token_and_weights,
1442+ int clip_skip,
1443+ bool zero_out_masked = false ) {
1444+ auto & tokens = std::get<0 >(token_and_weights);
1445+ auto & weights = std::get<1 >(token_and_weights);
1446+
1447+ int64_t t0 = ggml_time_ms ();
1448+ struct ggml_tensor * hidden_states = NULL ; // [N, n_token, 3584]
1449+
1450+ auto input_ids = vector_to_ggml_tensor_i32 (work_ctx, tokens);
1451+
1452+ qwenvl->compute (n_threads,
1453+ input_ids,
1454+ &hidden_states,
1455+ work_ctx);
1456+ {
1457+ auto tensor = hidden_states;
1458+ float original_mean = ggml_tensor_mean (tensor);
1459+ for (int i2 = 0 ; i2 < tensor->ne [2 ]; i2++) {
1460+ for (int i1 = 0 ; i1 < tensor->ne [1 ]; i1++) {
1461+ for (int i0 = 0 ; i0 < tensor->ne [0 ]; i0++) {
1462+ float value = ggml_tensor_get_f32 (tensor, i0, i1, i2);
1463+ value *= weights[i1];
1464+ ggml_tensor_set_f32 (tensor, value, i0, i1, i2);
1465+ }
1466+ }
1467+ }
1468+ float new_mean = ggml_tensor_mean (tensor);
1469+ ggml_tensor_scale (tensor, (original_mean / new_mean));
1470+ }
1471+
1472+ GGML_ASSERT (hidden_states->ne [1 ] > prompt_template_encode_start_idx);
1473+
1474+ ggml_tensor* new_hidden_states = ggml_new_tensor_3d (work_ctx,
1475+ GGML_TYPE_F32,
1476+ hidden_states->ne [0 ],
1477+ hidden_states->ne [1 ] - prompt_template_encode_start_idx,
1478+ hidden_states->ne [2 ]);
1479+
1480+ ggml_tensor_iter (new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
1481+ float value = ggml_tensor_get_f32 (hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
1482+ ggml_tensor_set_f32 (new_hidden_states, value, i0, i1, i2, i3);
1483+ });
1484+
1485+ int64_t t1 = ggml_time_ms ();
1486+ LOG_DEBUG (" computing condition graph completed, taking %" PRId64 " ms" , t1 - t0);
1487+ return SDCondition (new_hidden_states, nullptr , nullptr );
1488+ }
1489+
1490+ SDCondition get_learned_condition (ggml_context* work_ctx,
1491+ int n_threads,
1492+ const std::string& text,
1493+ int clip_skip,
1494+ int width,
1495+ int height,
1496+ int adm_in_channels = -1 ,
1497+ bool zero_out_masked = false ) {
1498+ std::string prompt = " <|im_start|>system\n Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n <|im_start|>user\n " + text + " <|im_end|>\n <|im_start|>assistant\n " ;
1499+ auto tokens_and_weights = tokenize (prompt, 0 , false );
1500+ return get_learned_condition_common (work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked);
14171501 }
14181502};
14191503
0 commit comments