Commit a47bbf1: Develop (#74)
Add BLIS support for AMD CPUs.
feifeibear authored Jun 11, 2020
1 parent 4750f34 commit a47bbf1
Showing 11 changed files with 108 additions and 172 deletions.
22 changes: 21 additions & 1 deletion 3rd/CMakeLists.txt
@@ -56,8 +56,28 @@ SET_PROPERTY(TARGET cnpy PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/
target_include_directories(cnpy INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/cnpy/include/)
add_dependencies(cnpy extern_cnpy)


if (${BLAS_PROVIDER} STREQUAL "openblas")
include(openblas.cmake)
include(eigen.cmake)
endif()


if (${BLAS_PROVIDER} STREQUAL "blis")
message(STATUS "CMAKE_CURRENT_BINARY_DIR " ${CMAKE_CURRENT_BINARY_DIR})
ExternalProject_Add(extern_blis
GIT_REPOSITORY https://github.com/flame/blis.git
GIT_TAG 0.7.0
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/blis
SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/blis
INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/blis
CONFIGURE_COMMAND COMMAND ""
BUILD_COMMAND COMMAND ./configure --enable-threading=openmp --enable-cblas --prefix=${CMAKE_CURRENT_BINARY_DIR}/blis auto && make -j ${nproc} && make check -j ${nproc}
INSTALL_COMMAND COMMAND make install
BUILD_BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/blis/lib/libblis.a)
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/blis/include/blis/)
add_library(blis STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET blis PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/blis/lib/libblis.a)
target_include_directories(blis INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/blis/include/blis/)
add_dependencies(blis extern_blis)
include(eigen.cmake)
endif()
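A note on how this block is used (a sketch, not part of the diff): `ExternalProject_Add` clones BLIS at tag 0.7.0 and drives its own `./configure` script, and the trailing `auto` argument lets BLIS detect the host microarchitecture, which is what selects the AMD Zen kernels this commit targets. The path is taken when the build is configured with `cmake .. -DBLAS_PROVIDER=blis` (the cache variable is defined in the top-level CMakeLists.txt below).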
13 changes: 5 additions & 8 deletions CMakeLists.txt
@@ -21,11 +21,11 @@ set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_FLAGS "-Wall")
set(CMAKE_C_FLAGS "-Wall")

-set(TURBO_TRANSFORMERS_VERSION 0.2.0)
+set(TURBO_TRANSFORMERS_VERSION 0.2.1)

option(WITH_PROFILER "Compile with gperftools" OFF)
option(WITH_GPU "Build with GPU" OFF)
-option(WITH_MODULE_BENCHMAKR "Build with GPU" ON)
+option(WITH_MODULE_BENCHMAKR "Catch2 unitest with benchmarking" ON)


if (WITH_GPU)
@@ -42,14 +42,12 @@ if(WITH_GPU)
endif()

set(MKLROOT "/opt/intel/mkl" CACHE PATH "The mkl library root")
-set(BLAS_PROVIDER "mkl" CACHE STRING "Set the blas provider library, in [openblas, mkl]")
+set(BLAS_PROVIDER "mkl" CACHE STRING "Set the blas provider library, in [openblas, mkl, blis]")
if (${BLAS_PROVIDER} STREQUAL "mkl")
find_package(MKL REQUIRED)
endif()
-if (${CMAKE_CXX_COMPILER_ID} STREQUAL Intel)
-message(STATUS "Fast Transformer is built with a intel compiler!")
-add_definitions(-D__USE_INTEL_COMPILER__)
-endif ()

+message(STATUS "Blas provider is ${BLAS_PROVIDER}")

add_subdirectory(3rd)
include_directories(3rd/FP16/include)
Expand All @@ -65,7 +63,6 @@ else ()
message(WARNING "OpenMP is not supported")
endif ()

-message(STATUS "Blas provider is ${BLAS_PROVIDER}")

if (WITH_PROFILER)
find_package(Gperftools REQUIRED)
12 changes: 6 additions & 6 deletions README.md
@@ -1,5 +1,4 @@
## turbo_transformers: a fast and user-friendly tool for transformer inference on CPU and GPU
-[Chinese Version](./README_cn.md)
![logo](./images/logo.jpeg)

### **make transformers serving fast by adding a turbo to your inference engine!**
@@ -43,14 +42,15 @@ Method 1: I want to run the unit tests
cd /workspace
sh tools/build_and_run_unittests.sh $PWD -DWITH_GPU=OFF
# you can switch between OpenBLAS, MKL, and BLIS by modifying this line in CMakeLists.txt
-# set(BLAS_PROVIDER "mkl" CACHE STRING "Set the blas provider library, in [openblas, mkl]")
+# set(BLAS_PROVIDER "mkl" CACHE STRING "Set the blas provider library, in [openblas, mkl, blis]")
```
Method 2: I do not want to run the unit tests
```
cd /workspace
mkdir -p build && cd build
cmake .. -DWITH_GPU=OFF
make -j 4
pip install `find . -name *whl`
```
3. Run benchmark (optional) in docker, compare with pytorch, torch-JIT, onnxruntime
@@ -67,7 +67,7 @@ sh tool/build_conda_package.sh

*We also prepared a docker image containing the CPU version of TurboTransformers, as well as other related works (onnxrt v1.2.0 and pytorch-jit), on dockerhub*
```
-*docker pull thufeifeibear/turbo_transformers:0.2.0-release-cpu-dev*
+docker pull thufeifeibear/turbo_transformers:0.2.0-release-cpu-dev
```
### Installation on GPU
```
@@ -94,7 +94,7 @@ bash gpu_run_benchmark.sh
```
*We also prepared a docker image containing the GPU version of TurboTransformers.*
```
-*docker pull thufeifeibear/turbo_transformers:0.2.0-cuda10.0-cudnn7-devel-ubuntu18.04-gpu-release*
+docker pull thufeifeibear/turbo_transformers:0.2.0-cuda10.0-cudnn7-devel-ubuntu18.04-gpu-release
```

### Usage
@@ -170,4 +170,4 @@ weight = torch.clone(torch.t(pooler_params['dense.weight']))

## Contact us
Although we recommend you post your problem via GitHub issues, you can also join our Turbo user group.
-Scan this [QR code](./images/namecode.pdf "qrcode") and our contactor as your WeChat friend.
+Scan this [QR code](./images/namecode.pdf "qrcode") and add our contactor as your WeChat friend.
146 changes: 0 additions & 146 deletions README_cn.md

This file was deleted.

3 changes: 3 additions & 0 deletions turbo_transformers/core/CMakeLists.txt
@@ -41,6 +41,9 @@ if (${BLAS_PROVIDER} STREQUAL "mkl")
elseif (${BLAS_PROVIDER} STREQUAL "openblas")
target_link_libraries(tt_core PUBLIC OpenBlas::OpenBlas PUBLIC Eigen3::Eigen)
target_compile_definitions(tt_core PUBLIC -DTT_BLAS_USE_OPENBLAS)
elseif (${BLAS_PROVIDER} STREQUAL "blis")
target_link_libraries(tt_core PUBLIC blis Eigen3::Eigen)
target_compile_definitions(tt_core PUBLIC -DTT_BLAS_USE_BLIS)
endif ()
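Note that the imported `blis` target already carries its header directory as an `INTERFACE` include path (set in 3rd/CMakeLists.txt above), so linking `tt_core` against `blis` is enough for the BLIS headers to be found; no separate include setup is needed here.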


15 changes: 13 additions & 2 deletions turbo_transformers/core/blas.h
@@ -20,11 +20,22 @@ namespace turbo_transformers {
using BlasInt = MKL_INT;
}

-#else
+#elif defined(TT_BLAS_USE_OPENBLAS) || defined(TT_BLAS_USE_BLIS)
#include "cblas.h"
#if defined(TT_BLAS_USE_OPENBLAS)

namespace turbo_transformers {
using BlasInt = blasint;
} // namespace turbo_transformers
#elif defined(TT_BLAS_USE_BLIS)
#include <unistd.h>

namespace turbo_transformers {
using BlasInt = f77_int;
} // namespace turbo_transformers

using blasint = turbo_transformers::BlasInt;
#endif

extern "C" {
void cblas_sgemm_batch(const CBLAS_ORDER Layout,
@@ -39,5 +50,5 @@ void cblas_sgemm_batch(const CBLAS_ORDER Layout,
const blasint* group_size);
void vsTanh(blasint N, const float* in, float* out);
}

#else
#endif
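The BLIS branch maps `BlasInt` to `f77_int`, the BLAS integer type BLIS defines, and then aliases OpenBLAS's `blasint` name to it, so the `cblas_sgemm_batch` and `vsTanh` declarations below compile unchanged against either provider; the `#include <unistd.h>` presumably satisfies a dependency of BLIS's cblas.h.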
50 changes: 50 additions & 0 deletions turbo_transformers/core/blas_blis.cpp
@@ -0,0 +1,50 @@
// Copyright (C) 2020 THL A29 Limited, a Tencent company.
// All rights reserved.
// Licensed under the BSD 3-Clause License (the "License"); you may
// not use this file except in compliance with the License. You may
// obtain a copy of the License at
// https://opensource.org/licenses/BSD-3-Clause
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" basis,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
// See the AUTHORS file for names of contributors.
#include "blas.h"
#define EIGEN_DONT_PARALLELIZE
#include "unsupported/Eigen/CXX11/Tensor"
extern "C" {
void cblas_sgemm_batch(const CBLAS_ORDER Layout,
const CBLAS_TRANSPOSE* transa_array,
const CBLAS_TRANSPOSE* transb_array,
const blasint* m_array, const blasint* n_array,
const blasint* k_array, const float* alpha_array,
const float** a_array, const blasint* lda_array,
const float** b_array, const blasint* ldb_array,
const float* beta_array, float** c_array,
const blasint* ldc_array, const blasint group_count,
const blasint* group_size) {
int idx = 0;
for (int i = 0; i < group_count; ++i) {
auto alpha = alpha_array[i];
auto beta = beta_array[i];
for (int j = 0; j < group_size[i]; ++j) {
cblas_sgemm(Layout, transa_array[i], transb_array[i], m_array[i],
n_array[i], k_array[i], alpha, a_array[idx], lda_array[i],
b_array[idx], ldb_array[i], beta, c_array[idx], ldc_array[i]);
++idx;
}
}
}

using Vec = Eigen::TensorMap<Eigen::Tensor<float, 1>>;

void vsTanh(blasint N, const float* in, float* out) {
Vec input(const_cast<float*>(in), N);
Vec output(out, N);

// Let Eigen calculate tanh.
// Eigen can use `FAST_MATH`.
output = input.tanh();
}
}
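BLIS's CBLAS layer provides `cblas_sgemm` but not MKL's batched extension, so this file emulates `cblas_sgemm_batch` with a plain loop over groups, and `vsTanh` with Eigen's tensor `tanh`. Below is a minimal calling sketch (not from the commit; it assumes a CBLAS header on the include path and the `blasint` alias set up in blas.h) showing how the per-group arrays map onto the flat matrix lists:

```cpp
#include <cstdio>

#include "cblas.h"  // assumption: BLIS/OpenBLAS CBLAS header is on the include path

int main() {
  // One group of two row-major 2x2 products: C_i = 1.0 * A_i * B_i + 0.0 * C_i.
  // Per-group arrays are indexed 0..group_count-1 by the shim's outer loop.
  const blasint m[] = {2}, n[] = {2}, k[] = {2};
  const blasint lda[] = {2}, ldb[] = {2}, ldc[] = {2};
  const float alpha[] = {1.0f}, beta[] = {0.0f};
  const CBLAS_TRANSPOSE trans[] = {CblasNoTrans};
  const blasint group_count = 1;
  const blasint group_size[] = {2};

  // Flat matrix lists are indexed by the shim's running `idx` counter.
  float a0[] = {1, 2, 3, 4}, a1[] = {5, 6, 7, 8};
  float b0[] = {1, 0, 0, 1}, b1[] = {2, 0, 0, 2};  // identity, 2 * identity
  float c0[4] = {0}, c1[4] = {0};
  const float* a_array[] = {a0, a1};
  const float* b_array[] = {b0, b1};
  float* c_array[] = {c0, c1};

  // The shim issues one cblas_sgemm per matrix triple, so afterwards
  // c0 == a0 and c1 == 2 * a1.
  cblas_sgemm_batch(CblasRowMajor, trans, trans, m, n, k, alpha, a_array, lda,
                    b_array, ldb, beta, c_array, ldc, group_count, group_size);
  std::printf("c1 = [%g %g; %g %g]\n", c1[0], c1[1], c1[2], c1[3]);
  return 0;
}
```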
5 changes: 4 additions & 1 deletion turbo_transformers/core/config.cpp
@@ -24,8 +24,9 @@ void SetNumThreads(int n_th) {
// The order seems important. Set MKL NUM_THREADS before OMP.
#ifdef TT_BLAS_USE_MKL
mkl_set_num_threads(n_th);
-#else
+#elif TT_BLAS_USE_OPENBLAS
openblas_set_num_threads(n_th);
#elif TT_BLAS_USE_BLIS
#endif
#ifdef _OPENMP
omp_set_num_threads(n_th);
@@ -37,6 +38,8 @@ BlasProvider GetBlasProvider() {
return BlasProvider::MKL;
#elif defined(TT_BLAS_USE_OPENBLAS)
return BlasProvider::OpenBlas;
#elif defined(TT_BLAS_USE_BLIS)
return BlasProvider::BLIS;
#else
#error "unexpected code";
#endif
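The empty `TT_BLAS_USE_BLIS` branch in `SetNumThreads` looks deliberate: built with `--enable-threading=openmp`, BLIS takes its thread count from the `BLIS_NUM_THREADS` (or `OMP_NUM_THREADS`) environment variable, and runtime control would go through BLIS's native `bli_thread_set_num_threads()` rather than an OpenBLAS-style setter, so the commit leaves threading to the `omp_set_num_threads` call that follows the `#endif`.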
5 changes: 1 addition & 4 deletions turbo_transformers/core/config.h
@@ -14,10 +14,7 @@
#pragma once
namespace turbo_transformers {
namespace core {
-enum class BlasProvider {
-MKL,
-OpenBlas,
-};
+enum class BlasProvider { MKL, OpenBlas, BLIS };

BlasProvider GetBlasProvider();

4 changes: 2 additions & 2 deletions turbo_transformers/loaders/CMakeLists.txt
@@ -12,6 +12,6 @@
# See the AUTHORS file for names of contributors.
add_library(tt_npz_loader npz_load.cpp)
target_link_libraries(tt_npz_loader
-PUBLIC dlpack cnpy
-PRIVATE tt_core zlib
+PUBLIC dlpack cnpy zlib
+PRIVATE tt_core
)
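Moving `zlib` from the `PRIVATE` to the `PUBLIC` link set presumably ensures that zlib appears after the static `cnpy` archive on consumers' link lines, since `cnpy` is what actually references the zlib symbols.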
