llama.cpp: add scripts for building and running
danbev committed Aug 21, 2024
1 parent 2c58695 commit 485dbb7
Showing 14 changed files with 96 additions and 0 deletions.
2 changes: 2 additions & 0 deletions fundamentals/llama.cpp/scripts/build-configure.sh
@@ -0,0 +1,2 @@
cmake -S . -B build -DLLAMA_CURL=ON
cmake --build build
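Note: -DLLAMA_CURL=ON links the tools against libcurl so that models can be fetched over HTTP, which means the libcurl development headers have to be available at configure time. A minimal sketch for Debian/Ubuntu (the package name is an assumption about that distro family):
sudo apt-get install libcurl4-openssl-dev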
2 changes: 2 additions & 0 deletions fundamentals/llama.cpp/scripts/build-cuda.sh
@@ -0,0 +1,2 @@
cmake -S . -B build -DGGML_CUDA=On -DCMAKE_BUILD_TYPE=Debug
cmake --build build
2 changes: 2 additions & 0 deletions fundamentals/llama.cpp/scripts/build-debug.sh
@@ -0,0 +1,2 @@
cmake -S . -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
2 changes: 2 additions & 0 deletions fundamentals/llama.cpp/scripts/build-kompute.sh
@@ -0,0 +1,2 @@
cmake -S . -B build -DGGML_KOMPUTE=On
cmake --build build
5 changes: 5 additions & 0 deletions fundamentals/llama.cpp/scripts/build-rocm.sh
@@ -0,0 +1,5 @@
#cmake -B ./build -S . -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=/opt/rocm-6.1.2/llvm/bin/clang -DCMAKE_CXX_COMPILER=/opt/rocm-6.1.2/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx1030;gfx1100;gfx1101;gfx1102'

cmake -B ./build -S . -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=/opt/rocm-6.1.2/llvm/bin/clang -DCMAKE_CXX_COMPILER=/opt/rocm-6.1.2/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx1030'

cmake --build ./build --config Release -- -j8
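The active ROCm configuration above targets only gfx1030, while the commented-out variant would build for several RDNA targets at once. To check which gfx architecture the local GPU actually reports before editing AMDGPU_TARGETS, a quick sketch (assuming ROCm's rocminfo tool is installed):
rocminfo | grep gfx   # the GPU agent section lists a name such as gfx1030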
2 changes: 2 additions & 0 deletions fundamentals/llama.cpp/scripts/build-vulkan.sh
@@ -0,0 +1,2 @@
cmake -S . -B build -DGGML_VULKAN=On
cmake --build build
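All of the build scripts follow the same pattern: a CMake configure step into build/ with a backend-specific flag, followed by cmake --build. A hedged sketch of the overall workflow, assuming the scripts are invoked from the directory that holds llama.cpp's top-level CMakeLists.txt:
rm -rf build                  # start from a clean build tree when switching backends
bash scripts/build-debug.sh   # configure and do the initial build
cmake --build build -j8       # incremental, parallel rebuilds after source changes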
11 changes: 11 additions & 0 deletions fundamentals/llama.cpp/scripts/convert-gemma2.sh
@@ -0,0 +1,11 @@
#!/bin/bash
source venv/bin/activate

model_name=gemma-2-9b-it
model_dir=~/.cache/huggingface/hub/models--google--${model_name}/snapshots/93be03fbe3787f19bf03a4b1d3d75d36cb1f6ace

#python convert-hf-to-gguf.py $model_dir --outfile=models/${model_name}.gguf --outtype f16

python convert-hf-to-gguf.py $model_dir --outfile=models/${model_name}.gguf --outtype f16 --split-max-tensors 100

deactivate
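convert-gemma2.sh assumes a Python virtual environment named venv that already has the conversion dependencies installed. A minimal sketch for creating it, assuming llama.cpp's requirements.txt covers what convert-hf-to-gguf.py needs:
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt   # torch, transformers, gguf, etc.
deactivate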
3 changes: 3 additions & 0 deletions fundamentals/llama.cpp/scripts/inspect-model.sh
@@ -0,0 +1,3 @@
source venv/bin/activate
gguf-py/scripts/gguf_dump.py "$1"
deactivate
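Example invocation (the model path is only an illustration; any local .gguf file works):
bash scripts/inspect-model.sh models/gemma-2-9b-it.gguf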
4 changes: 4 additions & 0 deletions fundamentals/llama.cpp/scripts/run-embeddings.sh
@@ -0,0 +1,4 @@
#./llama-embedding -m models/llama-2-7b-chat.Q4_K_M.gguf --no-warmup --pooling mean -p "What is LoRA?"
#gdb --args ./llama-embedding -m models/llama-2-7b-chat.Q4_K_M.gguf --no-warmup --pooling mean -p "What is LoRA?"
#gdb --args ./llama-embedding -m models/llama-2-7b-chat.Q4_K_M.gguf --no-warmup --pooling last -p "What is LoRA?"
gdb --args ./llama-embedding -m models/llama-2-7b-chat.Q4_K_M.gguf --no-warmup --pooling cls -p "What is LoRA?"
41 changes: 41 additions & 0 deletions fundamentals/llama.cpp/scripts/run-main.sh
@@ -0,0 +1,41 @@
#./llama-cli -m models/tinyllama-1.1b-1t-openorca.Q2_K.gguf --prompt 'The answer to 1 + 1 is' -n 5 --verbose-prompt
#gdb --args ./llama-cli -m models/tinyllama-1.1b-1t-openorca.Q2_K.gguf --prompt 'The answer to 1 + 1 is' -n 5 --verbose-prompt


## Run with session file
#gdb --args ./llama-cli -m models/tinyllama-1.1b-1t-openorca.Q2_K.gguf --prompt 'The answer to 1 + 1 is' -n 5 --verbose-prompt --prompt-cache main-session.txt
#./llama-cli -m models/tinyllama-1.1b-1t-openorca.Q2_K.gguf --prompt 'The answer to 1 + 1 is' -n 5 --verbose-prompt --prompt-cache main-session.txt
#gdb --args ./llama-cli -m models/tinyllama-1.1b-1t-openorca.Q2_K.gguf --prompt 'Hello world' -n 3 --verbose-prompt --prompt-cache main-session.txt --dump-kv-cache


#gdb --args ./llama-cli -m models/tinyllama-1.1b-1t-openorca.Q2_K.gguf --prompt 'Hello world' -n 3 --verbose-prompt --dump-kv-cache

#gdb --args ./llama-cli -m /home/danbev/.cache/lm-studio/models/TheBloke/phi-2-GGUF/phi-2.Q3_K_M.gguf --prompt 'The answer to 1 + 1 is' -n 5 --verbose-prompt --temp 0
#./llama-cli -m /home/danbev/.cache/lm-studio/models/TheBloke/phi-2-GGUF/phi-2.Q3_K_M.gguf --prompt 'The answer to 1 + 1 is' -n 5 --verbose-prompt --temp 0 --top-p 0.950 --min-p 0.05 --repeat-penalty 1.1 --typical 1


#./llama-cli -m /home/danbev/.cache/lm-studio/models/TheBloke/phi-2-GGUF/phi-2.Q3_K_M.gguf --color -i -r "User:" -f prompts/chat-with-bob.txt

#./llama-cli -m models/gemma-2-9b.gguf -p "<start_of_turn>user
#What is LoRA?<end_of_turn>
#<start_of_turn>model"

#./llama-cli -m models/gemma-2-9b.gguf -ngl 15 -p "Hi"
#./llama-cli -m ~/.cache/lm-studio/models/bartowski/gemma-2-9b-it-GGUF/gemma-2-9b-it-Q4_K_S.gguf -ngl 15 -p "<start_of_turn>user\nWhat is LoRA?<end_of_turn>\n<start_of_turn>model"
#./llama-cli -m models/gemma-2-9b-it.gguf -ngl 15 -p "<start_of_turn>user\nWhat is LoRA?<end_of_turn>\n<start_of_turn>model"
#./llama-cli -m models/gemma-2-9b-it.gguf -ngl 15 -p "What is LoRA?"
#./llama-cli -m models/gemma-2-9b-it.gguf -ngl 15 -p "Dan loves icecream"
#./llama-cli -m models/gemma-2-9b-it.gguf -dkvc -ngl 15 -p "Dan loves icecream"
#gdb --args ./llama-cli -m models/gemma-2-9b-it.gguf --grp-attn-n 2 --grp-attn-w 4 -p "Dan loves icecream"


#gdb --args ./llama-cli -m models/llama-2-7b.Q4_0.gguf --no-warmup --rope-scaling yarn --rope-freq-scale 1 --yarn-ext-factor 1.0 -ngl 10 -p "What is LoRA?" -n 10
#./llama-cli -m models/llama-2-7b.Q4_0.gguf --no-warmup --rope-scaling yarn --rope-freq-scale 1 --yarn-ext-factor 1.0 -ngl 10 -p "What is LoRA?" -n 10
# Testing Self-Extend
#./llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 10 --grp-attn-n 4 --grp-attn-w 32 -f pg1184.txt -c 16384 --temp 0
# llama-2-7b.Q4_0.gguf was trained on a 4096 context size
#./llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 10 -f self-extend.txt -c 8192 --temp 0 -n 256 --grp-attn-n 4 --grp-attn-w 256
#./llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 10 -f self-extend.txt -c 8192 --temp 0 -n 256
#./llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 10 -f self-extend.txt -c 8192 --temp 0 -n 256

./llama-cli -m models/mamba-gpt-7b-q4_0.gguf --no-warmup -ngl 10 -p "What is LoRA?" -n 10
1 change: 1 addition & 0 deletions fundamentals/llama.cpp/scripts/run-passkey.sh
@@ -0,0 +1 @@
make -j && ./llama-passkey -m ./models/llama-2-7b.Q4_0.gguf --junk 250
12 changes: 12 additions & 0 deletions fundamentals/llama.cpp/scripts/run-self-extend.sh
@@ -0,0 +1,12 @@
# Testing Self-Extend
#./llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 10 --grp-attn-n 4 --grp-attn-w 32 -f pg1184.txt -c 16384 --temp 0
# llama-2-7b.Q4_0.gguf was trained on a 4096 context size
#./llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 10 -f self-extend.txt -c 8192 --temp 0 -n 100 --grp-attn-n 4 --grp-attn-w 32
#gdb --args ./llama-cli -m /home/danbev/.cache/lm-studio/models/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/tinyllama-1.1b-1t-openorca.Q2_K.gguf -ngl 10 -f self-extend.txt -c 4096 --temp 1 -n 100 --grp-attn-n 4 --grp-attn-w 512
#gdb --args ./llama-cli -m /home/danbev/.cache/lm-studio/models/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/tinyllama-1.1b-1t-openorca.Q2_K.gguf -ngl 10 -f self-extend.txt -c 8000 --temp 1 -n 200 --grp-attn-n 4 --grp-attn-w 128
#gdb --args ./llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 10 -f self-extend.txt -c 8000 --temp 1 -n 200 --grp-attn-n 2 --grp-attn-w 2048
#./llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 10 -f self-extend.txt -c 8000 --temp 1 -n 200 --grp-attn-n 128 --grp-attn-w 2048
gdb --args ./llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 10 -f self-extend.txt -c 8000 --temp 1 --grp-attn-n 2 --grp-attn-w 2048
#./llama-cli -m /home/danbev/.cache/lm-studio/models/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/tinyllama-1.1b-1t-openorca.Q2_K.gguf -ngl 10 -f self-extend.txt -c 5000 --temp 1 -n 100 --grp-attn-n 4 --grp-attn-w 32
#./llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 10 -f self-extend.txt -c 8192 --temp 0 -n 256
#./llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 10 -f self-extend.txt -c 8192 --temp 0 -n 256
3 changes: 3 additions & 0 deletions fundamentals/llama.cpp/scripts/run-tests.sh
@@ -0,0 +1,3 @@
#!/bin/bash
echo "Running tests target..."
cmake --build build --target test --verbose
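Building the test target drives CTest under the hood, so the same tests can also be run directly, which makes failing output easier to read. An equivalent invocation, assuming CMake 3.20+ for --test-dir:
ctest --test-dir build --output-on-failure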
6 changes: 6 additions & 0 deletions fundamentals/llama.cpp/scripts/run-tokenize.sh
@@ -0,0 +1,6 @@
#./llama-tokenize -m models/llama-2-7b.Q4_0.gguf -f self-extend.txt --show-count

#./llama-tokenize -m /home/danbev/.cache/lm-studio/models/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/tinyllama-1.1b-1t-openorca.Q2_K.gguf -f /home/danbev/work/lmstudio/llmster/electron/vendor/llm-engine/gtest/data/self-extend-test.txt --show-count

./llama-tokenize -m /home/danbev/.cache/lm-studio/models/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/tinyllama-1.1b-1t-openorca.Q2_K.gguf -f self-extend.txt --show-count
#./llama-tokenize -m models/llama-2-7b.Q4_0.gguf -f self-extend.txt --show-count
