
Commit 7b1e792

deps(llama.cpp): bump to latest, update build variables (#2669)
* :arrow_up: Update ggerganov/llama.cpp
* deps(llama.cpp): update build variables to follow upstream
  (update build recipes with ggml-org/llama.cpp#8006)
* Disable shared libs by default in llama.cpp
* Disable shared libs in llama.cpp Makefile
* Disable metal embedding for now, until it is tested
* fix(mac): explicitly enable metal
* debug
* fix typo

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Signed-off-by: Ettore Di Giacinto <[email protected]>
Co-authored-by: mudler <[email protected]>
1 parent 30b883a commit 7b1e792
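
For anyone rebuilding against this revision, the practical effect of the upstream rename is that the CPU/GPU toggles passed through CMAKE_ARGS now use the GGML_ prefix, and cuBLAS builds are selected with GGML_CUDA rather than LLAMA_CUBLAS. A minimal before/after sketch using the instruction-set flags that appear in the diffs below (your exact flag set will differ):

```
# Before this commit: llama.cpp options carried the LLAMA_ prefix
CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make build

# After this commit (ggml-org/llama.cpp#8006): the same options use the GGML_ prefix
CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make build
```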

File tree

9 files changed: +39 -32 lines changed

Diff for: .github/workflows/test.yml

+1 -1

@@ -220,7 +220,7 @@ jobs:
     export CPLUS_INCLUDE_PATH=/usr/local/include
     # Used to run the newer GNUMake version from brew that supports --output-sync
     export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
-    BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make --jobs 4 --output-sync=target test
+    BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
 - name: Setup tmate session if tests fail
   if: ${{ failure() }}
   uses: mxschmitt/[email protected]

Diff for: Makefile

+13 -13

@@ -5,7 +5,7 @@ BINARY_NAME=local-ai
 
 # llama.cpp versions
 GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=e112b610a1a75cb7fa8351e1a933e2e7a755a5ce
+CPPLLAMA_VERSION?=ae5d0f4b899ff2842bfca561370c945ad8d4368b
 
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -80,8 +80,8 @@ ifeq ($(OS),Darwin)
     BUILD_TYPE=metal
 # disable metal if on Darwin and any other value is explicitly passed.
 else ifneq ($(BUILD_TYPE),metal)
-    CMAKE_ARGS+=-DLLAMA_METAL=OFF
-    export LLAMA_NO_ACCELERATE=1
+    CMAKE_ARGS+=-DGGML_METAL=OFF
+    export GGML_NO_ACCELERATE=1
 endif
 
 ifeq ($(BUILD_TYPE),metal)
@@ -98,13 +98,13 @@ endif
 
 ifeq ($(BUILD_TYPE),cublas)
     CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
-    export LLAMA_CUBLAS=1
+    export GGML_CUDA=1
     export WHISPER_CUDA=1
     CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
 endif
 
 ifeq ($(BUILD_TYPE),vulkan)
-    CMAKE_ARGS+=-DLLAMA_VULKAN=1
+    CMAKE_ARGS+=-DGGML_VULKAN=1
 endif
 
 ifeq ($(BUILD_TYPE),hipblas)
@@ -118,13 +118,13 @@ ifeq ($(BUILD_TYPE),hipblas)
     export WHISPER_HIPBLAS=1
     GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
     AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
-    CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+    CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
     CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
 endif
 
 ifeq ($(BUILD_TYPE),metal)
     CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
-    export LLAMA_METAL=1
+    export GGML_METAL=1
     export WHISPER_METAL=1
 endif
 
@@ -354,7 +354,7 @@ else
 endif
 
 dist-cross-linux-arm64:
-    CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" \
+    CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" \
     STATIC=true $(MAKE) build
     mkdir -p release
 # if BUILD_ID is empty, then we don't append it to the binary name
@@ -711,21 +711,21 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc
     cp -rf backend/cpp/llama backend/cpp/llama-avx2
     $(MAKE) -C backend/cpp/llama-avx2 purge
     $(info ${GREEN}I llama-cpp build info:avx2${RESET})
-    CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
+    CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
     cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
 
 backend-assets/grpc/llama-cpp-avx: backend-assets/grpc
     cp -rf backend/cpp/llama backend/cpp/llama-avx
     $(MAKE) -C backend/cpp/llama-avx purge
     $(info ${GREEN}I llama-cpp build info:avx${RESET})
-    CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
+    CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
     cp -rfv backend/cpp/llama-avx/grpc-server backend-assets/grpc/llama-cpp-avx
 
 backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc
     cp -rf backend/cpp/llama backend/cpp/llama-fallback
     $(MAKE) -C backend/cpp/llama-fallback purge
     $(info ${GREEN}I llama-cpp build info:fallback${RESET})
-    CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
+    CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
     cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
 # TODO: every binary should have its own folder instead, so can have different metal implementations
 ifeq ($(BUILD_TYPE),metal)
@@ -736,7 +736,7 @@ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
     cp -rf backend/cpp/llama backend/cpp/llama-cuda
     $(MAKE) -C backend/cpp/llama-cuda purge
     $(info ${GREEN}I llama-cpp build info:cuda${RESET})
-    CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
+    CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
     cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda
 
 backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
@@ -764,7 +764,7 @@ backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
     cp -rf backend/cpp/llama backend/cpp/llama-grpc
     $(MAKE) -C backend/cpp/llama-grpc purge
     $(info ${GREEN}I llama-cpp build info:grpc${RESET})
-    CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
+    CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
     cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc
 
 backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
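
Because the Makefile now forwards GGML_-prefixed options to llama.cpp, a quick way to sanity-check which options the pinned CPPLLAMA_VERSION actually exposes is to configure a scratch build and list the CMake cache. This is only a sketch and assumes a local llama.cpp checkout plus a working CMake toolchain:

```
# From a llama.cpp checkout pinned to CPPLLAMA_VERSION above:
# configure into a throwaway directory and list the cached GGML_* options
cmake -S . -B /tmp/llamacpp-cache -LH | grep '^GGML_'
```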

Diff for: backend/cpp/llama/Makefile

+18 -11

@@ -6,35 +6,42 @@ BUILD_TYPE?=
 ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 TARGET?=--target grpc-server
 
-# If build type is cublas, then we set -DLLAMA_CUBLAS=ON to CMAKE_ARGS automatically
+# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
+
+# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
-    CMAKE_ARGS+=-DLLAMA_CUBLAS=ON
-# If build type is openblas then we set -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+    CMAKE_ARGS+=-DGGML_CUDA=ON
+# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
-    CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
-# If build type is clblas (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+    CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
-    CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+    CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
 else ifeq ($(BUILD_TYPE),hipblas)
-    CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
-# If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
+    CMAKE_ARGS+=-DGGML_HIPBLAS=ON
+# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
 else ifeq ($(OS),Darwin)
     ifneq ($(BUILD_TYPE),metal)
-        CMAKE_ARGS+=-DLLAMA_METAL=OFF
+        CMAKE_ARGS+=-DGGML_METAL=OFF
     else
+        CMAKE_ARGS+=-DGGML_METAL=ON
+        # Until this is tested properly, we disable embedded metal file
+        # as we already embed it as part of the LocalAI assets
+        CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=OFF
         TARGET+=--target ggml-metal
     endif
 endif
 
 ifeq ($(BUILD_TYPE),sycl_f16)
-    CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+    CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 endif
 
 ifeq ($(BUILD_TYPE),sycl_f32)
-    CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+    CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 endif
 
 llama.cpp:
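
For reference, on macOS with BUILD_TYPE=metal the Darwin branch above now resolves to roughly the following configure and build steps. This is an illustrative sketch only; the real grpc-server recipe also merges in the caller's CMAKE_ARGS and the gRPC backend sources:

```
# Approximate effect of the Darwin/metal branch of this Makefile
cmake -B build -DBUILD_SHARED_LIBS=OFF -DGGML_METAL=ON -DGGML_METAL_EMBED_LIBRARY=OFF
cmake --build build --target grpc-server --target ggml-metal
```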

Diff for: docs/content/docs/advanced/fine-tuning.md

+1 -1

@@ -118,7 +118,7 @@ And we convert it to the gguf format that LocalAI can consume:
 
 # Convert to gguf
 git clone https://github.com/ggerganov/llama.cpp.git
-pushd llama.cpp && make LLAMA_CUBLAS=1 && popd
+pushd llama.cpp && make GGML_CUDA=1 && popd
 
 # We need to convert the pytorch model into ggml for quantization
 # It crates 'ggml-model-f16.bin' in the 'merged' directory.

Diff for: docs/content/docs/faq.md

+1 -1

@@ -55,4 +55,4 @@ This typically happens when your prompt exceeds the context size. Try to reduce
 
 ### I'm getting a 'SIGILL' error, what's wrong?
 
-Your CPU probably does not have support for certain instructions that are compiled by default in the pre-built binaries. If you are running in a container, try setting `REBUILD=true` and disable the CPU instructions that are not compatible with your CPU. For instance: `CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make build`
+Your CPU probably does not have support for certain instructions that are compiled by default in the pre-built binaries. If you are running in a container, try setting `REBUILD=true` and disable the CPU instructions that are not compatible with your CPU. For instance: `CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make build`

Diff for: docs/content/docs/getting-started/build.md

+2 -2

@@ -101,14 +101,14 @@ Here is the list of the variables available that can be used to customize the bu
 LocalAI uses different backends based on ggml and llama.cpp to run models. If your CPU doesn't support common instruction sets, you can disable them during build:
 
 ```
-CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_AVX=OFF -DLLAMA_FMA=OFF" make build
+CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_AVX=OFF -DGGML_FMA=OFF" make build
 ```
 
 To have effect on the container image, you need to set `REBUILD=true`:
 
 ```
 docker run quay.io/go-skynet/localai
-docker run --rm -ti -p 8080:8080 -e DEBUG=true -e MODELS_PATH=/models -e THREADS=1 -e REBUILD=true -e CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_AVX=OFF -DLLAMA_FMA=OFF" -v $PWD/models:/models quay.io/go-skynet/local-ai:latest
+docker run --rm -ti -p 8080:8080 -e DEBUG=true -e MODELS_PATH=/models -e THREADS=1 -e REBUILD=true -e CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_AVX=OFF -DGGML_FMA=OFF" -v $PWD/models:/models quay.io/go-skynet/local-ai:latest
 ```
 
 {{% /alert %}}

Diff for: entrypoint.sh

+1 -1

@@ -22,7 +22,7 @@ else
     echo "@@@@@"
     echo "If you are experiencing issues with the pre-compiled builds, try setting REBUILD=true"
     echo "If you are still experiencing issues with the build, try setting CMAKE_ARGS and disable the instructions set as needed:"
-    echo 'CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF"'
+    echo 'CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF"'
     echo "see the documentation at: https://localai.io/basics/build/index.html"
     echo "Note: See also https://github.com/go-skynet/LocalAI/issues/288"
     echo "@@@@@"

Diff for: examples/e2e-fine-tuning/README.md

+1 -1

@@ -65,7 +65,7 @@ And we convert it to the gguf format that LocalAI can consume:
 
 # Convert to gguf
 git clone https://github.com/ggerganov/llama.cpp.git
-pushd llama.cpp && make LLAMA_CUBLAS=1 && popd
+pushd llama.cpp && make GGML_CUDA=1 && popd
 
 # We need to convert the pytorch model into ggml for quantization
 # It crates 'ggml-model-f16.bin' in the 'merged' directory.

Diff for: examples/e2e-fine-tuning/notebook.ipynb

+1 -1

@@ -1600,7 +1600,7 @@
 "source": [
     "\n",
     "!git clone https://github.com/ggerganov/llama.cpp.git\n",
-    "!cd llama.cpp && make LLAMA_CUBLAS=1\n",
+    "!cd llama.cpp && make GGML_CUDA=1\n",
     "\n"
 ]
 },
