diff --git a/README.md b/README.md
index 6ab6acf1282c6..5f7933c132dc3 100644
--- a/README.md
+++ b/README.md
@@ -459,14 +459,14 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
 - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
 
-## Other documentations
+## Other documentation
 
 - [main (cli)](./examples/main/README.md)
 - [server](./examples/server/README.md)
 - [jeopardy](./examples/jeopardy/README.md)
 - [GBNF grammars](./grammars/README.md)
 
-**Development documentations**
+**Development documentation**
 
 - [How to build](./docs/build.md)
 - [Running on Docker](./docs/docker.md)
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index 40ed2bdf375e9..860552f3a96df 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -6,15 +6,18 @@ if (CUDAToolkit_FOUND)
     message(STATUS "CUDA Toolkit found")
 
     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        # 52 == lowest CUDA 12 standard
-        # 60 == FP16 CUDA intrinsics
-        # 61 == integer CUDA intrinsics
-        # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
-        if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+        # native == GPUs available at build time
+        # 52     == Maxwell, lowest CUDA 12 standard
+        # 60     == P100, FP16 CUDA intrinsics
+        # 61     == Pascal, __dp4a instruction (per-byte integer dot product)
+        # 70     == V100, FP16 tensor cores
+        # 75     == Turing, int8 tensor cores
+        if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6")
+            set(CMAKE_CUDA_ARCHITECTURES "native")
+        elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
             set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
         else()
             set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
-            #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
         endif()
     endif()
     message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
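Usage sketch (illustrative, not part of the diff above): because the new logic sits inside `if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)`, a value passed explicitly at configure time still takes precedence over both the `native` detection and the fallback lists. The example below assumes the standard `-DGGML_CUDA=ON` build switch and uses compute capability 8.6 (RTX 30 series) purely as a sample value.

```sh
# Pin a single CUDA architecture by hand; the NOT DEFINED guard in
# ggml/src/ggml-cuda/CMakeLists.txt then skips the automatic selection entirely.
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86"
cmake --build build --config Release
```

Without such an override, and with `GGML_NATIVE` enabled (its usual default) plus CUDA 11.6 or newer, the build now targets only the GPUs detected on the build machine, as described by the `native == GPUs available at build time` comment in the diff.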