 import torch
 import numpy as np
 from gguf import *
-from transformers import CLIPModel, CLIPProcessor
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel

 TEXT = "clip.text"
 VISION = "clip.vision"
@@ -78,11 +78,19 @@ def bytes_to_unicode():
                 help="Save a text-only model. It can't be used to encode images")
 ap.add_argument("--vision-only", action="store_true", required=False,
                 help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
+                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+default_image_mean = [0.48145466, 0.4578275, 0.40821073]
+default_image_std = [0.26862954, 0.26130258, 0.27577711]
+ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor)', default=None)
+ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)

+# with proper
 args = ap.parse_args()

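A minimal standalone sketch (not part of the commit) of how the options added in this hunk behave once parsed; the option names mirror the ones defined above, while the reduced parser and the sample command lines are only illustrative:

import argparse

# Reduced parser containing only the options this hunk adds (illustrative).
ap = argparse.ArgumentParser()
ap.add_argument("--clip_model_is_vision", action="store_true", required=False)
ap.add_argument("--image_mean", type=float, nargs="+", default=None)
ap.add_argument("--image_std", type=float, nargs="+", default=None)

# A ShareGPT4V-style vision extract: flag set, normalization left unspecified.
args = ap.parse_args(["--clip_model_is_vision"])
assert args.clip_model_is_vision and args.image_mean is None and args.image_std is None

# Explicit override, matching the example comment in the hunk above.
args = ap.parse_args(["--image_mean", "0.48145466", "0.4578275", "0.40821073"])
assert args.image_mean == [0.48145466, 0.4578275, 0.40821073]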
@@ -96,15 +104,22 @@ def bytes_to_unicode():
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir

-
-with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
-    vocab = json.load(f)
-    tokens = [key for key in vocab]
+if args.clip_model_is_vision:
+    vocab = None
+    tokens = None
+else:
+    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+        vocab = json.load(f)
+        tokens = [key for key in vocab]

 with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
     config = json.load(f)
-    v_hparams = config["vision_config"]
-    t_hparams = config["text_config"]
+    if args.clip_model_is_vision:
+        v_hparams = config
+        t_hparams = None
+    else:
+        v_hparams = config["vision_config"]
+        t_hparams = config["text_config"]

 # possible data types
 #   ftype == 0 -> float32
@@ -117,9 +132,12 @@ def bytes_to_unicode():
 if args.use_f32:
     ftype = 0

-
-model = CLIPModel.from_pretrained(dir_model)
-processor = CLIPProcessor.from_pretrained(dir_model)
+if args.clip_model_is_vision:
+    model = CLIPVisionModel.from_pretrained(dir_model)
+    processor = None
+else:
+    model = CLIPModel.from_pretrained(dir_model)
+    processor = CLIPProcessor.from_pretrained(dir_model)

 fname_middle = None
 has_text_encoder = True
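For readability, the loading branch above can be read as the following standalone helper; this is an illustrative paraphrase of the hunk, not code from the commit, and the parameter names are stand-ins for the script's dir_model and args.clip_model_is_vision:

from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel

def load_clip(dir_model: str, clip_model_is_vision: bool):
    # A pure vision checkpoint (e.g. a ShareGPT4V vision extract) has no text
    # branch and may ship without a processor config, so the processor is
    # skipped and the normalization constants must come from the CLI or the
    # hard-coded defaults later in the script.
    if clip_model_is_vision:
        model = CLIPVisionModel.from_pretrained(dir_model)
        processor = None
    else:
        model = CLIPModel.from_pretrained(dir_model)
        processor = CLIPProcessor.from_pretrained(dir_model)
    return model, processor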
@@ -128,13 +146,13 @@ def bytes_to_unicode():
 if args.text_only:
     fname_middle = "text-"
     has_vision_encoder = False
-elif args.vision_only:
-    fname_middle = "vision-"
-    has_text_encoder = False
 elif args.llava_projector is not None:
     fname_middle = "mmproj-"
     has_text_encoder = False
     has_llava_projector = True
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
 else:
     fname_middle = ""
@@ -182,8 +200,12 @@ def bytes_to_unicode():
     block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)

-    image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
-    image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
+    if processor is not None:
+        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
+        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
+    else:
+        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+        image_std = args.image_std if args.image_std is not None else default_image_std
     fout.add_array("clip.vision.image_mean", image_mean)
     fout.add_array("clip.vision.image_std", image_std)
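The precedence introduced by this last hunk, restated as an illustrative standalone function (not from the commit): processor statistics win unless the user passed an explicit, non-default override, and when there is no processor (pure vision checkpoints) the CLI value is used, falling back to the hard-coded CLIP defaults.

default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]

def pick_norm(cli_value, processor_value, default_value):
    # Mirrors the hunk: a CLI value equal to the default is treated like
    # "not given", so a real processor statistic is never shadowed by it.
    if processor_value is not None:
        return processor_value if cli_value is None or cli_value == default_value else cli_value
    return cli_value if cli_value is not None else default_value

# Vision-only checkpoint, no processor, nothing on the CLI -> defaults.
assert pick_norm(None, None, default_image_mean) == default_image_mean
# Full CLIP checkpoint: the processor's stats beat a default-looking CLI value.
assert pick_norm(default_image_std, [0.5, 0.5, 0.5], default_image_std) == [0.5, 0.5, 0.5]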