From 1140e0c3380db2bb5d6f6577a371d04a8cc3fcfb Mon Sep 17 00:00:00 2001 From: Riceball LEE Date: Sat, 17 Aug 2024 01:19:37 +0800 Subject: [PATCH] feat: add GGMLFileQuantizationType and apply to test (#806) @mishig25 that's it for #794 --------- Co-authored-by: Xuan Son Nguyen --- packages/gguf/src/gguf.spec.ts | 18 ++++++++------- packages/gguf/src/gguf.ts | 2 +- packages/gguf/src/types.ts | 41 +++++++++++++++++++++++++++++++++- 3 files changed, 51 insertions(+), 10 deletions(-) diff --git a/packages/gguf/src/gguf.spec.ts b/packages/gguf/src/gguf.spec.ts index ca0eb602e..eb74fc5d6 100644 --- a/packages/gguf/src/gguf.spec.ts +++ b/packages/gguf/src/gguf.spec.ts @@ -1,6 +1,6 @@ import { beforeAll, describe, expect, it } from "vitest"; import type { GGUFParseOutput } from "./gguf"; -import { GGMLQuantizationType, gguf, ggufAllShards, parseGgufShardFilename } from "./gguf"; +import { GGMLFileQuantizationType, GGMLQuantizationType, gguf, ggufAllShards, parseGgufShardFilename } from "./gguf"; import fs from "node:fs"; const URL_LLAMA = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/191239b/llama-2-7b-chat.Q2_K.gguf"; @@ -21,9 +21,11 @@ describe("gguf", () => { if (!fs.existsSync(".cache")) { fs.mkdirSync(".cache"); } - const res = await fetch(URL_BIG_METADATA); - const arrayBuf = await res.arrayBuffer(); - fs.writeFileSync(".cache/model.gguf", Buffer.from(arrayBuf)); + if (!fs.existsSync(".cache/model.gguf")) { + const res = await fetch(URL_BIG_METADATA); + const arrayBuf = await res.arrayBuffer(); + fs.writeFileSync(".cache/model.gguf", Buffer.from(arrayBuf)); + } }); it("should parse a llama2 7b", async () => { @@ -37,7 +39,7 @@ describe("gguf", () => { tensor_count: 291n, kv_count: 19n, "general.architecture": "llama", - "general.file_type": 10, + "general.file_type": GGMLFileQuantizationType.MOSTLY_Q2_K, "general.name": "LLaMA v2", "general.quantization_version": 2, "llama.attention.head_count": 32, @@ -96,7 +98,7 @@ describe("gguf", () => { tensor_count: 291n, kv_count: 24n, "general.architecture": "llama", - "general.file_type": 17, + "general.file_type": GGMLFileQuantizationType.MOSTLY_Q5_K_M, "general.name": "mistralai_mistral-7b-instruct-v0.2", "general.quantization_version": 2, "llama.attention.head_count": 32, @@ -134,7 +136,7 @@ describe("gguf", () => { tensor_count: 164n, kv_count: 21n, "general.architecture": "gemma", - "general.file_type": GGMLQuantizationType.Q8_K, // 15 + "general.file_type": GGMLFileQuantizationType.MOSTLY_Q4_K_M, "general.name": "gemma-2b-it", "general.quantization_version": 2, "gemma.attention.head_count": 8, @@ -171,7 +173,7 @@ describe("gguf", () => { tensor_count: 197n, kv_count: 23n, "general.architecture": "bert", - "general.file_type": GGMLQuantizationType.F16, + "general.file_type": GGMLFileQuantizationType.MOSTLY_F16, "general.name": "bge-small-en-v1.5", "bert.attention.causal": false, "bert.attention.head_count": 12, diff --git a/packages/gguf/src/gguf.ts b/packages/gguf/src/gguf.ts index 4d32567bf..945d5a494 100644 --- a/packages/gguf/src/gguf.ts +++ b/packages/gguf/src/gguf.ts @@ -4,7 +4,7 @@ import { isBackend } from "./utils/isBackend"; import { promisesQueue } from "./utils/promisesQueue"; export type { MetadataBaseValue, MetadataValue, Version, GGUFMetadata, GGUFTensorInfo, GGUFParseOutput } from "./types"; -export { GGUFValueType, GGMLQuantizationType, Architecture } from "./types"; +export { GGUFValueType, GGMLFileQuantizationType, GGMLQuantizationType, Architecture } from "./types"; export { GGUF_QUANT_DESCRIPTIONS } from "./quant-descriptions"; export const RE_GGUF_FILE = /\.gguf$/; diff --git a/packages/gguf/src/types.ts b/packages/gguf/src/types.ts index f2fbbcdfb..02872b95c 100644 --- a/packages/gguf/src/types.ts +++ b/packages/gguf/src/types.ts @@ -6,6 +6,45 @@ export type MetadataValue = MetadataBaseValue | MetadataBaseValue[] | MetadataVa export type Version = 1 | 2 | 3; +export enum GGMLFileQuantizationType { + MOSTLY_F32 = 0, + MOSTLY_F16 = 1, + MOSTLY_Q4_0 = 2, + MOSTLY_Q4_1 = 3, + MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + // MOSTLY_Q4_2 = 5, // support has been removed + // MOSTLY_Q4_3 = 6, // support has been removed + MOSTLY_Q8_0 = 7, + MOSTLY_Q5_0 = 8, + MOSTLY_Q5_1 = 9, + MOSTLY_Q2_K = 10, + MOSTLY_Q3_K_S = 11, + MOSTLY_Q3_K_M = 12, + MOSTLY_Q3_K_L = 13, + MOSTLY_Q4_K_S = 14, + MOSTLY_Q4_K_M = 15, + MOSTLY_Q5_K_S = 16, + MOSTLY_Q5_K_M = 17, + MOSTLY_Q6_K = 18, + MOSTLY_IQ2_XXS = 19, + MOSTLY_IQ2_XS = 20, + MOSTLY_Q2_K_S = 21, + MOSTLY_IQ3_XS = 22, + MOSTLY_IQ3_XXS = 23, + MOSTLY_IQ1_S = 24, + MOSTLY_IQ4_NL = 25, + MOSTLY_IQ3_S = 26, + MOSTLY_IQ3_M = 27, + MOSTLY_IQ2_S = 28, + MOSTLY_IQ2_M = 29, + MOSTLY_IQ4_XS = 30, + MOSTLY_IQ1_M = 31, + MOSTLY_BF16 = 32, + MOSTLY_Q4_0_4_4 = 33, + MOSTLY_Q4_0_4_8 = 34, + MOSTLY_Q4_0_8_8 = 35, +} + export enum GGMLQuantizationType { F32 = 0, F16 = 1, @@ -60,7 +99,7 @@ export type Architecture = (typeof ARCHITECTURES)[number]; export interface GGUFGeneralInfo { "general.architecture": TArchitecture; "general.name"?: string; - "general.file_type"?: number; + "general.file_type"?: GGMLFileQuantizationType; "general.quantization_version"?: number; }