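// quantize-stats: quantizes and dequantizes every eligible tensor of a model
// and reports the round-trip error per quantization type (RMSE, max error,
// quantiles, optional histogram), optionally broken down per layer.
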
#include "ggml.h"
#include "llama.h"

#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <map>
#include <numeric>
#include <regex>
#include <string>
#include <unordered_map>
#include <vector>

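// Printable names for each ggml_type, indexed by the enum value; the
// static_assert below only checks the count, so keep the order in sync
// with the enum in ggml.h.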
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");

struct quantize_stats_params {
    std::string model = "models/7B/ggml-model-f16.bin";
    bool verbose = false;
    bool per_layer_stats = false;
    bool print_histogram = false;
    bool reference = false;
    std::vector<std::string> include_layers;
    std::vector<std::string> exclude_layers;
    std::vector<enum ggml_type> include_types;
};

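// Elements processed per round-trip chunk, and the error histogram layout:
// 150 buckets over absolute errors in [0, 0.03), i.e. 0.0002 per bucket;
// larger errors are clamped into the last bucket.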
const int64_t SCRATCH_ELEMENTS = 32*32;
const size_t HISTOGRAM_BUCKETS = 150;
const double HISTOGRAM_RANGE = 0.03;

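// Running totals for quantization error: sum of squared errors (for RMSE),
// the largest absolute error seen, and a histogram of absolute errors.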
struct error_stats {
    size_t num_samples;
    double total_error;
    double max_error;
    uint64_t error_histogram[HISTOGRAM_BUCKETS];
};

void quantize_stats_print_usage(int /*argc*/, char ** argv) {
    quantize_stats_params params;
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help            show this help message and exit\n");
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "  -r, --reference\n");
    fprintf(stderr, "                        use reference implementation (default: false)\n");
    fprintf(stderr, "  -v, --verbose\n");
    fprintf(stderr, "                        verbose output (default: false)\n");
    fprintf(stderr, "  -p, --per-layer-stats\n");
    fprintf(stderr, "                        print stats per layer (default: false)\n");
    fprintf(stderr, "  --histogram\n");
    fprintf(stderr, "                        print error histogram (default: false)\n");
    fprintf(stderr, "  -l LAYER, --include-layer LAYER\n");
    fprintf(stderr, "                        only test layers matching pattern\n");
    fprintf(stderr, "  -L LAYER, --exclude-layer LAYER\n");
    fprintf(stderr, "                        exclude layers matching pattern\n");
    fprintf(stderr, "  -t TYPE, --type TYPE\n");
    fprintf(stderr, "                        only test given type (q4_0, q4_1)\n");
    fprintf(stderr, "\n");
}

// Check if a layer is included/excluded by command line
bool layer_included(const quantize_stats_params & params, const std::string & layer) {
    for (const auto& excluded : params.exclude_layers) {
        if (std::regex_search(layer, std::regex(excluded))) {
            return false;
        }
    }
    for (const auto& included : params.include_layers) {
        if (std::regex_search(layer, std::regex(included))) {
            return true;
        }
    }
    return params.include_layers.empty();
}

// Update error statistics given vectors with the before/after result of quantization
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
    for (int64_t i = 0; i < nelements; i++) {
        double diff = input[i] - output[i];
        stats.total_error += diff * diff;
        stats.max_error = fmax(fabs(diff), stats.max_error);
        stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
    }
    stats.num_samples += nelements;
}

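// Approximate a quantile of the absolute error from the histogram: returns
// the upper edge of the first bucket at which the cumulative count reaches
// the requested fraction of all samples.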
double find_quantile(const error_stats & stats, double quantile) {
    double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);

    double accum = 0;
    for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
        accum += stats.error_histogram[i];
        if (accum >= sum*quantile) {
            return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
        }
    }
    return INFINITY;
}

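// Print a one-line summary (RMSE, max error, 95th percentile and median
// upper bounds) and, optionally, the full error histogram.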
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
    double rmse = sqrt(stats.total_error / (double) stats.num_samples);
    double median = find_quantile(stats, .5);
    double pct95 = find_quantile(stats, .95);
    printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
    if (print_histogram) {
        printf("Error distribution:\n");
        for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
            double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
            double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
            if (i == HISTOGRAM_BUCKETS - 1) upper = INFINITY;
            printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
        }
    }
}

// copied from ggml.h - verify that we can access this as a flat array
static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    return
        tensor->nb[0] == ggml_type_size(tensor->type) &&
        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}

// Run quantization function for a single layer and update error stats
void test_roundtrip_on_layer(
        std::string & name,
        bool print_layer_stats,
        const quantize_fns_t & qfns,
        bool use_reference,
        const ggml_tensor * layer,
        float * input_scratch,
        char * quantized_scratch,
        float * output_scratch,
        error_stats & total_error) {

    assert(tensor_is_contiguous(layer));
    error_stats layer_error {};
    int64_t nelements = ggml_nelements(layer);

    for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
        int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);

        if (layer->type == GGML_TYPE_F16) {
            // convert f16 elements to f32 in the scratch buffer
            for (int i = 0; i < chunk_size; i++) {
                input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
            }
        } else {
            // f32 data can be read in place, no copy needed
            input_scratch = ggml_get_data_f32(layer) + offset;
        }

        if (use_reference) {
            qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
        } else {
            qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
        }
        qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);

        update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
        if (print_layer_stats) {
            update_error_stats(chunk_size, input_scratch, output_scratch, layer_error);
        }
    }
    if (print_layer_stats) {
        print_error_stats(name, layer_error, false);
    }
}

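// Example invocation (binary name and model path are illustrative):
//   ./quantize-stats -m models/7B/ggml-model-f16.bin -t q4_0 -p --histogram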
int main(int argc, char ** argv) {
    ggml_time_init();

    quantize_stats_params params;

    // read command line

    bool invalid_param = false;
    std::string arg;
    for (int i = 1; i < argc; i++) {
        arg = argv[i];

        if (arg == "-h" || arg == "--help") {
            quantize_stats_print_usage(argc, argv);
            exit(0);
        } else if (arg == "-r" || arg == "--reference") {
            params.reference = true;
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
        } else if (arg == "-p" || arg == "--per-layer-stats") {
            params.per_layer_stats = true;
        } else if (arg == "--histogram") {
            params.print_histogram = true;
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.model = argv[i];
        } else if (arg == "-l" || arg == "--include-layer") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.include_layers.push_back(argv[i]);
        } else if (arg == "-L" || arg == "--exclude-layer") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.exclude_layers.push_back(argv[i]);
        } else if (arg == "-t" || arg == "--type") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            int j;
            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
                // find match
            }
            if (j < GGML_TYPE_COUNT) {
                params.include_types.push_back((ggml_type) j);
            } else {
                fprintf(stderr, "error: %s not in list of types\n", argv[i]);
                invalid_param = true;
            }
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            quantize_stats_print_usage(argc, argv);
            return 1;
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        quantize_stats_print_usage(argc, argv);
        return 1;
    }

    // load the model
    fprintf(stderr, "Loading model\n");

    const int64_t t_main_start_us = ggml_time_us();
    llama_context * ctx;

    {
        auto lparams = llama_context_default_params();

        // minimal context parameters: the tool only reads tensor data,
        // no inference is run
        lparams.n_ctx     = 256;
        lparams.n_parts   = 1;
        lparams.seed      = 1;
        lparams.f16_kv    = false;
        lparams.use_mlock = false;

        ctx = llama_init_from_file(params.model.c_str(), lparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
            return 1;
        }
    }

    // Sort tensors for consistent output
    const auto tensors = llama_internal_get_tensor_map(ctx);
    std::map<std::string, struct ggml_tensor *> tensors_sorted { tensors.begin(), tensors.end() };

    // check layer tensors
    int included_layers = 0;
    int64_t max_nelements = 0;
    bool is_f16 = false;
    for (const auto& kv_tensor : tensors_sorted) {
        if (!layer_included(params, kv_tensor.first)) {
            continue;
        }
        if (params.verbose) {
            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
        }
        if (kv_tensor.second->type == GGML_TYPE_F16) {
            is_f16 = true;
        } else if (kv_tensor.second->type != GGML_TYPE_F32) {
            fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
            llama_free(ctx);
            return 1;
        }
        included_layers++;
        max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
    }

    if (is_f16) {
        printf("note: source model is f16\n");
    }
    printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
    // allocate scratch space: the quantized buffer is sized at 4 bytes per
    // element, a generous upper bound for any of the quantized formats
    std::vector<float> input_scratch(SCRATCH_ELEMENTS);
    std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
    std::vector<float> output_scratch(SCRATCH_ELEMENTS);

    // loop through quantization types
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
            continue;
        }
        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
            if (params.verbose) {
                printf("testing %s ...\n", type_strs[i]);
            }

            error_stats global_stats {};

            for (const auto& kv_tensor : tensors_sorted) {
                if (!layer_included(params, kv_tensor.first)) {
                    continue;
                }
                if (params.verbose) {
                    printf("  %s ...\n", kv_tensor.first.c_str());
                }
                std::string layer_name { type_strs[i] };
                layer_name += "::" + kv_tensor.first;
                test_roundtrip_on_layer(
                    layer_name,
                    params.per_layer_stats,
                    qfns,
                    params.reference,
                    kv_tensor.second,
                    input_scratch.data(),
                    quantized_scratch.data(),
                    output_scratch.data(),
                    global_stats
                );
            }

            print_error_stats(type_strs[i], global_stats, params.print_histogram);
        }
    }

    llama_free(ctx);
    // report timing
    {
        const int64_t t_main_end_us = ggml_time_us();

        printf("\n");
        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
    }

    return 0;
}