Commit a47bbf1: Develop (#74)
Add BLIS support for AMD CPUs.
feifeibear authored Jun 11, 2020
1 parent 4750f34 commit a47bbf1
Showing 11 changed files with 108 additions and 172 deletions.
22 changes: 21 additions & 1 deletion 3rd/CMakeLists.txt
@@ -56,8 +56,28 @@ SET_PROPERTY(TARGET cnpy PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/
target_include_directories(cnpy INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/cnpy/include/)
add_dependencies(cnpy extern_cnpy)


if (${BLAS_PROVIDER} STREQUAL "openblas")
include(openblas.cmake)
include(eigen.cmake)
endif()


if (${BLAS_PROVIDER} STREQUAL "blis")
message(STATUS "CMAKE_CURRENT_BINARY_DIR " ${CMAKE_CURRENT_BINARY_DIR})
ExternalProject_Add(extern_blis
GIT_REPOSITORY https://github.com/flame/blis.git
GIT_TAG 0.7.0
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/blis
SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/blis
INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/blis
CONFIGURE_COMMAND COMMAND ""
BUILD_COMMAND COMMAND ./configure --enable-threading=openmp --enable-cblas --prefix=${CMAKE_CURRENT_BINARY_DIR}/blis auto && make -j ${nproc} && make check -j ${nproc}
INSTALL_COMMAND COMMAND make install
BUILD_BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/blis/lib/libblis.a)
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/blis/include/blis/)
add_library(blis STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET blis PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/blis/lib/libblis.a)
target_include_directories(blis INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/blis/include/blis/)
add_dependencies(blis extern_blis)
include(eigen.cmake)
endif()
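A note on how this block is used (a sketch, not part of the diff): `ExternalProject_Add` clones BLIS at tag 0.7.0 and drives its own `./configure` script, and the trailing `auto` argument lets BLIS detect the host microarchitecture, which is what selects the AMD Zen kernels this commit targets. The path is taken when the build is configured with `cmake .. -DBLAS_PROVIDER=blis` (the cache variable is defined in the top-level CMakeLists.txt below).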
13 changes: 5 additions & 8 deletions CMakeLists.txt
@@ -21,11 +21,11 @@ set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_FLAGS "-Wall")
set(CMAKE_C_FLAGS "-Wall")

-set(TURBO_TRANSFORMERS_VERSION 0.2.0)
+set(TURBO_TRANSFORMERS_VERSION 0.2.1)

option(WITH_PROFILER "Compile with gperftools" OFF)
option(WITH_GPU "Build with GPU" OFF)
-option(WITH_MODULE_BENCHMAKR "Build with GPU" ON)
+option(WITH_MODULE_BENCHMAKR "Catch2 unitest with benchmarking" ON)


if (WITH_GPU)
@@ -42,14 +42,12 @@ if(WITH_GPU)
endif()

set(MKLROOT "/opt/intel/mkl" CACHE PATH "The mkl library root")
-set(BLAS_PROVIDER "mkl" CACHE STRING "Set the blas provider library, in [openblas, mkl]")
+set(BLAS_PROVIDER "mkl" CACHE STRING "Set the blas provider library, in [openblas, mkl, blis]")
if (${BLAS_PROVIDER} STREQUAL "mkl")
find_package(MKL REQUIRED)
endif()
-if (${CMAKE_CXX_COMPILER_ID} STREQUAL Intel)
-message(STATUS "Fast Transformer is built with a intel compiler!")
-add_definitions(-D__USE_INTEL_COMPILER__)
-endif ()

+message(STATUS "Blas provider is ${BLAS_PROVIDER}")

add_subdirectory(3rd)
include_directories(3rd/FP16/include)
Expand All @@ -65,7 +63,6 @@ else ()
message(WARNING "OpenMP is not supported")
endif ()

-message(STATUS "Blas provider is ${BLAS_PROVIDER}")

if (WITH_PROFILER)
find_package(Gperftools REQUIRED)
12 changes: 6 additions & 6 deletions README.md
@@ -1,5 +1,4 @@
## turbo_transformers: a fast and user-friendly tool for transformer inference on CPU and GPU
-[Chinese Version](./README_cn.md)
![logo](./images/logo.jpeg)

### **make transformers serving fast by adding a turbo to your inference engine!**
@@ -43,14 +42,15 @@ Method 1: I want to run the unit tests
cd /workspace
sh tools/build_and_run_unittests.sh $PWD -DWITH_GPU=OFF
# you can switch between OpenBLAS, MKL, and BLIS by modifying this line in CMakeLists.txt
-# set(BLAS_PROVIDER "mkl" CACHE STRING "Set the blas provider library, in [openblas, mkl]")
+# set(BLAS_PROVIDER "mkl" CACHE STRING "Set the blas provider library, in [openblas, mkl, blis]")
```
Method 2: I do not want to run the unit tests
```
cd /workspace
mkdir -p build && cd build
cmake .. -DWITH_GPU=OFF
make -j 4
pip install `find . -name *whl`
```
3. Run benchmark (optional) in docker, compare with pytorch, torch-JIT, onnxruntime
@@ -67,7 +67,7 @@ sh tool/build_conda_package.sh

*We also prepared a docker image containing the CPU version of TurboTransformers, as well as other related works (onnxrt v1.2.0 and pytorch-jit), on dockerhub*
```
-*docker pull thufeifeibear/turbo_transformers:0.2.0-release-cpu-dev*
+docker pull thufeifeibear/turbo_transformers:0.2.0-release-cpu-dev
```
### Installation on GPU
```
@@ -94,7 +94,7 @@ bash gpu_run_benchmark.sh
```
*We also prepared a docker image containing the GPU version of TurboTransformers.*
```
-*docker pull thufeifeibear/turbo_transformers:0.2.0-cuda10.0-cudnn7-devel-ubuntu18.04-gpu-release*
+docker pull thufeifeibear/turbo_transformers:0.2.0-cuda10.0-cudnn7-devel-ubuntu18.04-gpu-release
```

### Usage
@@ -170,4 +170,4 @@ weight = torch.clone(torch.t(pooler_params['dense.weight']))

## Contact us
Although we recommend you post your problem via GitHub issues, you can also join our Turbo user group.
-Scan this [QR code](./images/namecode.pdf "qrcode") and our contactor as your WeChat friend.
+Scan this [QR code](./images/namecode.pdf "qrcode") and add our contactor as your WeChat friend.
146 changes: 0 additions & 146 deletions README_cn.md

This file was deleted.

3 changes: 3 additions & 0 deletions turbo_transformers/core/CMakeLists.txt
@@ -41,6 +41,9 @@ if (${BLAS_PROVIDER} STREQUAL "mkl")
elseif (${BLAS_PROVIDER} STREQUAL "openblas")
target_link_libraries(tt_core PUBLIC OpenBlas::OpenBlas PUBLIC Eigen3::Eigen)
target_compile_definitions(tt_core PUBLIC -DTT_BLAS_USE_OPENBLAS)
elseif (${BLAS_PROVIDER} STREQUAL "blis")
target_link_libraries(tt_core PUBLIC blis Eigen3::Eigen)
target_compile_definitions(tt_core PUBLIC -DTT_BLAS_USE_BLIS)
endif ()
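Note that the imported `blis` target already carries its header directory as an `INTERFACE` include path (set in 3rd/CMakeLists.txt above), so linking `tt_core` against `blis` is enough for the BLIS headers to be found; no separate include setup is needed here.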


15 changes: 13 additions & 2 deletions turbo_transformers/core/blas.h
@@ -20,11 +20,22 @@ namespace turbo_transformers {
using BlasInt = MKL_INT;
}

-#else
+#elif defined(TT_BLAS_USE_OPENBLAS) || defined(TT_BLAS_USE_BLIS)
#include "cblas.h"
#if defined(TT_BLAS_USE_OPENBLAS)

namespace turbo_transformers {
using BlasInt = blasint;
} // namespace turbo_transformers
#elif defined(TT_BLAS_USE_BLIS)
#include <unistd.h>

namespace turbo_transformers {
using BlasInt = f77_int;
} // namespace turbo_transformers

using blasint = turbo_transformers::BlasInt;
#endif

extern "C" {
void cblas_sgemm_batch(const CBLAS_ORDER Layout,
@@ -39,5 +50,5 @@ void cblas_sgemm_batch(const CBLAS_ORDER Layout,
const blasint* group_size);
void vsTanh(blasint N, const float* in, float* out);
}

#else
#endif
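The BLIS branch maps `BlasInt` to `f77_int`, the BLAS integer type BLIS defines, and then aliases OpenBLAS's `blasint` name to it, so the `cblas_sgemm_batch` and `vsTanh` declarations below compile unchanged against either provider; the `#include <unistd.h>` presumably satisfies a dependency of BLIS's cblas.h.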
50 changes: 50 additions & 0 deletions turbo_transformers/core/blas_blis.cpp
@@ -0,0 +1,50 @@
// Copyright (C) 2020 THL A29 Limited, a Tencent company.
// All rights reserved.
// Licensed under the BSD 3-Clause License (the "License"); you may
// not use this file except in compliance with the License. You may
// obtain a copy of the License at
// https://opensource.org/licenses/BSD-3-Clause
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" basis,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
// See the AUTHORS file for names of contributors.
#include "blas.h"
#define EIGEN_DONT_PARALLELIZE
#include "unsupported/Eigen/CXX11/Tensor"
extern "C" {
void cblas_sgemm_batch(const CBLAS_ORDER Layout,
const CBLAS_TRANSPOSE* transa_array,
const CBLAS_TRANSPOSE* transb_array,
const blasint* m_array, const blasint* n_array,
const blasint* k_array, const float* alpha_array,
const float** a_array, const blasint* lda_array,
const float** b_array, const blasint* ldb_array,
const float* beta_array, float** c_array,
const blasint* ldc_array, const blasint group_count,
const blasint* group_size) {
int idx = 0;
for (int i = 0; i < group_count; ++i) {
auto alpha = alpha_array[i];
auto beta = beta_array[i];
for (int j = 0; j < group_size[i]; ++j) {
cblas_sgemm(Layout, transa_array[i], transb_array[i], m_array[i],
n_array[i], k_array[i], alpha, a_array[idx], lda_array[i],
b_array[idx], ldb_array[i], beta, c_array[idx], ldc_array[i]);
++idx;
}
}
}

using Vec = Eigen::TensorMap<Eigen::Tensor<float, 1>>;

void vsTanh(blasint N, const float* in, float* out) {
Vec input(const_cast<float*>(in), N);
Vec output(out, N);

// Let Eigen calculate tanh.
// Eigen can use `FAST_MATH`.
output = input.tanh();
}
}
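BLIS's CBLAS layer provides `cblas_sgemm` but not MKL's batched extension, so this file emulates `cblas_sgemm_batch` with a plain loop over groups, and `vsTanh` with Eigen's tensor `tanh`. Below is a minimal calling sketch (not from the commit; it assumes a CBLAS header on the include path and the `blasint` alias set up in blas.h) showing how the per-group arrays map onto the flat matrix lists:

```cpp
#include <cstdio>

#include "cblas.h"  // assumption: BLIS/OpenBLAS CBLAS header is on the include path

int main() {
  // One group of two row-major 2x2 products: C_i = 1.0 * A_i * B_i + 0.0 * C_i.
  // Per-group arrays are indexed 0..group_count-1 by the shim's outer loop.
  const blasint m[] = {2}, n[] = {2}, k[] = {2};
  const blasint lda[] = {2}, ldb[] = {2}, ldc[] = {2};
  const float alpha[] = {1.0f}, beta[] = {0.0f};
  const CBLAS_TRANSPOSE trans[] = {CblasNoTrans};
  const blasint group_count = 1;
  const blasint group_size[] = {2};

  // Flat matrix lists are indexed by the shim's running `idx` counter.
  float a0[] = {1, 2, 3, 4}, a1[] = {5, 6, 7, 8};
  float b0[] = {1, 0, 0, 1}, b1[] = {2, 0, 0, 2};  // identity, 2 * identity
  float c0[4] = {0}, c1[4] = {0};
  const float* a_array[] = {a0, a1};
  const float* b_array[] = {b0, b1};
  float* c_array[] = {c0, c1};

  // The shim issues one cblas_sgemm per matrix triple, so afterwards
  // c0 == a0 and c1 == 2 * a1.
  cblas_sgemm_batch(CblasRowMajor, trans, trans, m, n, k, alpha, a_array, lda,
                    b_array, ldb, beta, c_array, ldc, group_count, group_size);
  std::printf("c1 = [%g %g; %g %g]\n", c1[0], c1[1], c1[2], c1[3]);
  return 0;
}
```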
5 changes: 4 additions & 1 deletion turbo_transformers/core/config.cpp
@@ -24,8 +24,9 @@ void SetNumThreads(int n_th) {
// The order seems important. Set MKL NUM_THREADS before OMP.
#ifdef TT_BLAS_USE_MKL
mkl_set_num_threads(n_th);
-#else
+#elif TT_BLAS_USE_OPENBLAS
openblas_set_num_threads(n_th);
#elif TT_BLAS_USE_BLIS
#endif
#ifdef _OPENMP
omp_set_num_threads(n_th);
@@ -37,6 +38,8 @@ BlasProvider GetBlasProvider() {
return BlasProvider::MKL;
#elif defined(TT_BLAS_USE_OPENBLAS)
return BlasProvider::OpenBlas;
#elif defined(TT_BLAS_USE_BLIS)
return BlasProvider::BLIS;
#else
#error "unexpected code";
#endif
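The empty `TT_BLAS_USE_BLIS` branch in `SetNumThreads` looks deliberate: built with `--enable-threading=openmp`, BLIS takes its thread count from the `BLIS_NUM_THREADS` (or `OMP_NUM_THREADS`) environment variable, and runtime control would go through BLIS's native `bli_thread_set_num_threads()` rather than an OpenBLAS-style setter, so the commit leaves threading to the `omp_set_num_threads` call that follows the `#endif`.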
5 changes: 1 addition & 4 deletions turbo_transformers/core/config.h
@@ -14,10 +14,7 @@
#pragma once
namespace turbo_transformers {
namespace core {
-enum class BlasProvider {
-MKL,
-OpenBlas,
-};
+enum class BlasProvider { MKL, OpenBlas, BLIS };

BlasProvider GetBlasProvider();

4 changes: 2 additions & 2 deletions turbo_transformers/loaders/CMakeLists.txt
@@ -12,6 +12,6 @@
# See the AUTHORS file for names of contributors.
add_library(tt_npz_loader npz_load.cpp)
target_link_libraries(tt_npz_loader
-PUBLIC dlpack cnpy
-PRIVATE tt_core zlib
+PUBLIC dlpack cnpy zlib
+PRIVATE tt_core
)
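Moving `zlib` from the `PRIVATE` to the `PUBLIC` link set presumably ensures that zlib appears after the static `cnpy` archive on consumers' link lines, since `cnpy` is what actually references the zlib symbols.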
