diff --git a/packages/gguf/scripts/generate-llm.ts b/packages/gguf/scripts/generate-llm.ts index 4da57c46e..6ac73149a 100644 --- a/packages/gguf/scripts/generate-llm.ts +++ b/packages/gguf/scripts/generate-llm.ts @@ -8,27 +8,53 @@ import { writeFileSync } from "node:fs"; const SOURCE_CPP_URL = "https://raw.githubusercontent.com/ggerganov/llama.cpp/master/llama.cpp"; const DEST_FILE_PATH = "./src/transformer-llm.ts"; const DEST_COMMON_SOURCE = ` -type Attention = - & { [K in \`\${TArchitecture}.attention.head_count\`]: number } - & { [K in \`\${TArchitecture}.attention.head_count_kv\`]: number } - & { [K in \`\${TArchitecture}.attention.layer_norm_epsilon\`]: number } - & { [K in \`\${TArchitecture}.attention.layer_norm_rms_epsilon\`]: number } - & { [K in \`\${TArchitecture}.attention.alibi_bias_max\`]: number } - & { [K in \`\${TArchitecture}.attention.clip_kqv\`]: number } - & { [K in \`\${TArchitecture}.attention.use_norm\`]: number }; - -type Rope = - & { [K in \`\${TArchitecture}.rope.dimension_count\`]: number } - & { [K in \`\${TArchitecture}.rope.freq_base\`]: number } - & { [K in \`\${TArchitecture}.rope.scale\`]: number } - & { [K in \`\${TArchitecture}.rope.scale_linear\`]: number }; - -type MOE = - & { [K in \`\${TArchitecture}.expert_count\`]: number } - & { [K in \`\${TArchitecture}.expert_used_count\`]: number }; +/** This file is auto-generated by generate-llm.ts */ + +import type { ModelBase, GGUFGeneralInfo } from "./types"; + +type LLMBase = Partial>; + +type Attention = Record< + \`\${TArchitecture}.attention.head_count\`, + number +> & Partial>; + +export type TransformerLLMRopeScalingType = "none" | "linear" | "yarn"; +type Rope = Partial< + Record< + \`\${TArchitecture}.rope.dimension_count\` + | \`\${TArchitecture}.rope.freq_base\` + | \`\${TArchitecture}.rope.scale_linear\` + | \`\${TArchitecture}.rope.scaling.factor\` + | \`\${TArchitecture}.rope.scaling.original_context_length\`, + number + > + & Record<\`\${TArchitecture}.rope.scaling.type\`, TransformerLLMRopeScalingType> + & Record<\`\${TArchitecture}.rope.finetuned\`, boolean> +>; + +type MOE = Partial< + Record< + \`\${TArchitecture}.expert_count\` + | \`\${TArchitecture}.expert_used_count\`, + number + > +>; export type TransformerLLMArchitecture = LLMArchitecture; // type alias -export type TransformerLLMBase = ModelBase +export type TransformerLLMBase = GGUFGeneralInfo + & LLMBase + & ModelBase & MOE & Attention & Rope; @@ -163,15 +189,11 @@ async function main() { ///////////////////////////////////// // write result to file const content = [ - "/** This file is auto-generated by generate-llm.ts */", - "", - 'import type { ModelBase } from "./types";', - "", + DEST_COMMON_SOURCE, "export const LLM_ARCHITECTURES = [", ...archList.map((a) => `\t${JSON.stringify(a.name)},`), "] as const;", "type LLMArchitecture = (typeof LLM_ARCHITECTURES)[number];", - DEST_COMMON_SOURCE, ...archList.map((a) => { let code = `export type ${a.tsName} = TransformerLLMBase<${JSON.stringify(a.name)}>`; if (a.hparams.length) { diff --git a/packages/gguf/src/gguf.spec.ts b/packages/gguf/src/gguf.spec.ts index 2e6f2c21a..04ceda7ab 100644 --- a/packages/gguf/src/gguf.spec.ts +++ b/packages/gguf/src/gguf.spec.ts @@ -37,22 +37,25 @@ describe("gguf", () => { "llama.rope.dimension_count": 128, }); - const tokens = metadata["tokenizer.ggml.tokens"]; - if (!Array.isArray(tokens)) { - throw new Error(); + expect(metadata["tokenizer.ggml.model"]); + if (metadata["tokenizer.ggml.model"]) { + const tokens = metadata["tokenizer.ggml.tokens"]; + if (!Array.isArray(tokens)) { + throw new Error(); + } + expect(tokens.slice(0, 10)).toEqual([ + "", + "", + "", + "<0x00>", + "<0x01>", + "<0x02>", + "<0x03>", + "<0x04>", + "<0x05>", + "<0x06>", + ]); } - expect(tokens.slice(0, 10)).toEqual([ - "", - "", - "", - "<0x00>", - "<0x01>", - "<0x02>", - "<0x03>", - "<0x04>", - "<0x05>", - "<0x06>", - ]); /// Tensor infos /// By convention we test the first and last tensor. diff --git a/packages/gguf/src/gguf.ts b/packages/gguf/src/gguf.ts index 8671593c0..c0aa92c3b 100644 --- a/packages/gguf/src/gguf.ts +++ b/packages/gguf/src/gguf.ts @@ -273,7 +273,7 @@ export async function gguf( offset += tensorCount.length; const numKv = readVersionedSize(r.view, offset, version, littleEndian); offset += numKv.length; - const metadata: GGUFMetadata = { + const metadata: GGUFMetadata<{ strict: false }> = { version, tensor_count: tensorCount.value, kv_count: numKv.value, diff --git a/packages/gguf/src/transformer-llm.ts b/packages/gguf/src/transformer-llm.ts index a9ef34296..8bad0261e 100644 --- a/packages/gguf/src/transformer-llm.ts +++ b/packages/gguf/src/transformer-llm.ts @@ -1,6 +1,56 @@ /** This file is auto-generated by generate-llm.ts */ -import type { ModelBase } from "./types"; +import type { ModelBase, GGUFGeneralInfo } from "./types"; + +type LLMBase = Partial< + Record< + `${TArchitecture}.vocab_size` | `${TArchitecture}.use_parallel_residual` | `${TArchitecture}.tensor_data_layout`, + number + > +>; + +type Attention = Record<`${TArchitecture}.attention.head_count`, number> & + Partial< + Record< + | `${TArchitecture}.attention.head_count_kv` + | `${TArchitecture}.attention.key_length` + | `${TArchitecture}.attention.value_length`, + number + > + >; + +export type TransformerLLMRopeScalingType = "none" | "linear" | "yarn"; +type Rope = Partial< + Record< + | `${TArchitecture}.rope.dimension_count` + | `${TArchitecture}.rope.freq_base` + | `${TArchitecture}.rope.scale_linear` + | `${TArchitecture}.rope.scaling.factor` + | `${TArchitecture}.rope.scaling.original_context_length`, + number + > & + Record<`${TArchitecture}.rope.scaling.type`, TransformerLLMRopeScalingType> & + Record<`${TArchitecture}.rope.finetuned`, boolean> +>; + +type MOE = Partial< + Record<`${TArchitecture}.expert_count` | `${TArchitecture}.expert_used_count`, number> +>; + +export type TransformerLLMArchitecture = LLMArchitecture; // type alias +export type TransformerLLMBase = GGUFGeneralInfo & + LLMBase & + ModelBase & + MOE & + Attention & + Rope; + +export enum TransformerLLMPoolingType { + UNSPECIFIED = -1, + NONE = 0, + MEAN = 1, + CLS = 2, +} export const LLM_ARCHITECTURES = [ "llama", @@ -37,36 +87,6 @@ export const LLM_ARCHITECTURES = [ "olmo", ] as const; type LLMArchitecture = (typeof LLM_ARCHITECTURES)[number]; - -type Attention = { [K in `${TArchitecture}.attention.head_count`]: number } & { - [K in `${TArchitecture}.attention.head_count_kv`]: number; -} & { [K in `${TArchitecture}.attention.layer_norm_epsilon`]: number } & { - [K in `${TArchitecture}.attention.layer_norm_rms_epsilon`]: number; -} & { [K in `${TArchitecture}.attention.alibi_bias_max`]: number } & { - [K in `${TArchitecture}.attention.clip_kqv`]: number; -} & { [K in `${TArchitecture}.attention.use_norm`]: number }; - -type Rope = { [K in `${TArchitecture}.rope.dimension_count`]: number } & { - [K in `${TArchitecture}.rope.freq_base`]: number; -} & { [K in `${TArchitecture}.rope.scale`]: number } & { [K in `${TArchitecture}.rope.scale_linear`]: number }; - -type MOE = { [K in `${TArchitecture}.expert_count`]: number } & { - [K in `${TArchitecture}.expert_used_count`]: number; -}; - -export type TransformerLLMArchitecture = LLMArchitecture; // type alias -export type TransformerLLMBase = ModelBase & - MOE & - Attention & - Rope; - -export enum TransformerLLMPoolingType { - UNSPECIFIED = -1, - NONE = 0, - MEAN = 1, - CLS = 2, -} - export type ArchLlama = TransformerLLMBase<"llama"> & { "llama.attention.layer_norm_rms_epsilon": number; }; diff --git a/packages/gguf/src/types.spec.ts b/packages/gguf/src/types.spec.ts new file mode 100644 index 000000000..9d20bfa8c --- /dev/null +++ b/packages/gguf/src/types.spec.ts @@ -0,0 +1,55 @@ +import { describe, it } from "vitest"; +import type { gguf } from "./gguf"; +import type { GGUFMetadata, GGUFParseOutput } from "./types"; + +describe("gguf-types", () => { + it("gguf() type can be casted between STRICT and NON_STRICT (at compile time)", async () => { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const result: Awaited> = { metadata: {} } as any; + const strictType = result as GGUFParseOutput<{ strict: true }>; + // @ts-expect-error because the key "abc" does not exist + strictType.metadata.abc = 123; + const nonStrictType = result as GGUFParseOutput<{ strict: false }>; + nonStrictType.metadata.abc = 123; // PASS, because it can be anything + // @ts-expect-error because ArrayBuffer is not a MetadataValue + nonStrictType.metadata.fff = ArrayBuffer; + }); + + it("GGUFType.NON_STRICT should be correct (at compile time)", async () => { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const model: GGUFMetadata<{ strict: false }> = {} as any; + model.kv_count = 123n; + model.abc = 456; // PASS, because it can be anything + }); + + it("GGUFType.STRICT should be correct (at compile time)", async () => { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const model: GGUFMetadata<{ strict: true }> = {} as any; + + if (model["general.architecture"] === "whisper") { + model["encoder.whisper.block_count"] = 0; + // @ts-expect-error because it must be a number + model["encoder.whisper.block_count"] = "abc"; + } + + if (model["tokenizer.ggml.model"] === undefined) { + // @ts-expect-error because it's undefined + model["tokenizer.ggml.eos_token_id"] = 1; + } + if (model["tokenizer.ggml.model"] === "gpt2") { + // @ts-expect-error because it must be a number + model["tokenizer.ggml.eos_token_id"] = undefined; + model["tokenizer.ggml.eos_token_id"] = 1; + } + + if (model["general.architecture"] === "mamba") { + model["mamba.ssm.conv_kernel"] = 0; + // @ts-expect-error because it must be a number + model["mamba.ssm.conv_kernel"] = "abc"; + } + if (model["general.architecture"] === "llama") { + // @ts-expect-error llama does not have ssm.* keys + model["mamba.ssm.conv_kernel"] = 0; + } + }); +}); diff --git a/packages/gguf/src/types.ts b/packages/gguf/src/types.ts index 69405d66a..9e6f89dbf 100644 --- a/packages/gguf/src/types.ts +++ b/packages/gguf/src/types.ts @@ -50,11 +50,16 @@ export enum GGUFValueType { const ARCHITECTURES = [...LLM_ARCHITECTURES, "rwkv", "whisper"] as const; export type Architecture = (typeof ARCHITECTURES)[number]; -interface General { - "general.architecture": Architecture; - "general.name": string; - "general.file_type": number; - "general.quantization_version": number; +export interface GGUFGeneralInfo { + "general.architecture": TArchitecture; + "general.name"?: string; + "general.file_type"?: number; + "general.quantization_version"?: number; +} + +type ModelMetadata = Whisper | RWKV | TransformerLLM; +interface NoModelMetadata { + "general.architecture"?: undefined; } export type ModelBase< @@ -62,9 +67,15 @@ export type ModelBase< | Architecture | `encoder.${Extract}` | `decoder.${Extract}`, -> = { [K in `${TArchitecture}.layer_count`]: number } & { [K in `${TArchitecture}.feed_forward_length`]: number } & { - [K in `${TArchitecture}.context_length`]: number; -} & { [K in `${TArchitecture}.embedding_length`]: number } & { [K in `${TArchitecture}.block_count`]: number }; +> = Record< + | `${TArchitecture}.context_length` + | `${TArchitecture}.block_count` + | `${TArchitecture}.embedding_length` + | `${TArchitecture}.feed_forward_length`, + number +>; + +/// Tokenizer type TokenizerModel = "no_vocab" | "llama" | "gpt2" | "bert"; interface Tokenizer { @@ -75,21 +86,47 @@ interface Tokenizer { "tokenizer.ggml.bos_token_id": number; "tokenizer.ggml.eos_token_id": number; "tokenizer.ggml.add_bos_token": boolean; - "tokenizer.chat_template": string; + "tokenizer.chat_template"?: string; } +interface NoTokenizer { + "tokenizer.ggml.model"?: undefined; +} + +/// Models outside of llama.cpp: "rwkv" and "whisper" -export type RWKV = ModelBase<"rwkv"> & { "rwkv.architecture_version": number }; -export type LLM = TransformerLLM | RWKV; -export type Whisper = ModelBase<"encoder.whisper"> & ModelBase<"decoder.whisper">; -export type Model = (LLM | Whisper) & Partial; +export type RWKV = GGUFGeneralInfo<"rwkv"> & + ModelBase<"rwkv"> & { + "rwkv.architecture_version": number; + }; -export type GGUFMetadata = { +// TODO: whisper.cpp doesn't yet support gguf. This maybe changed in the future. +export type Whisper = GGUFGeneralInfo<"whisper"> & + ModelBase<"encoder.whisper"> & + ModelBase<"decoder.whisper"> & { + "whisper.encoder.mels_count": number; + "whisper.encoder.attention.head_count": number; + "whisper.decoder.attention.head_count": number; + }; + +/// Types for parse output + +export interface GGUFMetadataOptions { + /** + * Enable strict type for known GGUF fields. + * + * @default true + */ + strict: boolean; +} + +export type GGUFMetadata = { version: Version; tensor_count: bigint; kv_count: bigint; -} & Partial & - Partial & - Record; +} & GGUFModelKV & + (Options extends { strict: true } ? unknown : Record); + +export type GGUFModelKV = (NoModelMetadata | ModelMetadata) & (NoTokenizer | Tokenizer); export interface GGUFTensorInfo { name: string; @@ -99,7 +136,7 @@ export interface GGUFTensorInfo { offset: bigint; } -export interface GGUFParseOutput { - metadata: GGUFMetadata; +export interface GGUFParseOutput { + metadata: GGUFMetadata; tensorInfos: GGUFTensorInfo[]; }