diff --git a/CMakeLists.txt b/CMakeLists.txt index 961b3284..b8231e3d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,8 @@ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_FLAGS "-Wall") set(CMAKE_C_FLAGS "-Wall") -set(TURBO_TRANSFORMERS_VERSION 0.5.0) + +set(TURBO_TRANSFORMERS_VERSION 0.5.1) option(WITH_PROFILER "Compile with profiler" OFF) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 780769d4..3227e0cc 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -30,9 +30,6 @@ --enable_mem_opt Use model aware memory optimization for BERT. """ -import json -import os - import docopt from turbo_benchmark_helper import benchmark_turbo_transformers from torch_benchmark_helper import benchmark_torch diff --git a/benchmark/onnx_benchmark_helper.py b/benchmark/onnx_benchmark_helper.py index fff7afe4..6b4ac69f 100644 --- a/benchmark/onnx_benchmark_helper.py +++ b/benchmark/onnx_benchmark_helper.py @@ -26,7 +26,6 @@ def generate_onnx_model(model_name: str, use_dynamic_axes: bool = False): import transformers import torch - import os test_device = torch.device( 'cuda:0') if backend == "GPU" and use_gpu else torch.device('cpu:0') @@ -45,6 +44,9 @@ def generate_onnx_model(model_name: str, elif model_name == "roberta": cfg = transformers.RobertaConfig() model = transformers.RobertaModel(cfg) + elif model_name == "distilbert": + cfg = transformers.DistilBertConfig() + model = transformers.DistilBertModel(cfg) else: raise (f"benchmark does not support {model_name}") diff --git a/benchmark/torch_benchmark_helper.py b/benchmark/torch_benchmark_helper.py index 784d4269..5b960df3 100644 --- a/benchmark/torch_benchmark_helper.py +++ b/benchmark/torch_benchmark_helper.py @@ -22,10 +22,6 @@ def benchmark_torch(model_name: str, seq_len: int, batch_size: int, n: int, import benchmark_helper test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0') - if use_gpu: - print("using GPU") - else: - print("using CPU") torch.set_grad_enabled(False) torch.set_num_threads(num_threads) @@ -39,6 +35,9 @@ def benchmark_torch(model_name: str, seq_len: int, batch_size: int, n: int, elif model_name == "roberta": cfg = transformers.RobertaConfig() model = transformers.RobertaModel(cfg) + elif model_name == "distilbert": + cfg = transformers.DistilBertConfig() + model = transformers.DistilBertModel(cfg) else: raise (f"benchmark does not support {model_name}") model.eval() diff --git a/benchmark/turbo_benchmark_helper.py b/benchmark/turbo_benchmark_helper.py index 8871bb5c..fbab9e38 100644 --- a/benchmark/turbo_benchmark_helper.py +++ b/benchmark/turbo_benchmark_helper.py @@ -24,10 +24,6 @@ def benchmark_turbo_transformers(model_name: str, seq_len: int, import turbo_transformers import benchmark_helper test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0') - if use_gpu: - print("using GPU") - else: - print("using CPU") cfg = None torch.set_grad_enabled(False) if model_name == "bert": @@ -48,6 +44,12 @@ def benchmark_turbo_transformers(model_name: str, seq_len: int, model.to(test_device) model.eval() model = turbo_transformers.RobertaModel.from_torch(model) + elif model_name == "distilbert": + cfg = transformers.DistilBertConfig() + model = transformers.DistilBertModel(cfg) + model.to(test_device) + model.eval() + model = turbo_transformers.DistilBertModel.from_torch(model) else: raise (f"benchmark does not support {model_name}") diff --git a/distrill/bert_model.txt b/distrill/bert_model.txt new file mode 100644 index 00000000..e69de29b diff --git a/distrill/distill_bert.txt b/distrill/distill_bert.txt new file mode 100644 index 00000000..e69de29b diff --git a/distrill/distrill_bert.py b/distrill/distrill_bert.py new file mode 100644 index 00000000..a105c519 --- /dev/null +++ b/distrill/distrill_bert.py @@ -0,0 +1,45 @@ +# Copyright (C) 2020 THL A29 Limited, a Tencent company. +# All rights reserved. +# Licensed under the BSD 3-Clause License (the "License"); you may +# not use this file except in compliance with the License. You may +# obtain a copy of the License at +# https://opensource.org/licenses/BSD-3-Clause +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# See the AUTHORS file for names of contributors. + +from transformers import DistilBertTokenizer, DistilBertModel +from transformers import BertTokenizer, BertModel +import torch + +tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") +inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") +# inputs = torch.randint(low=0, +# high=cfg.vocab_size - 1, +# size=(1, 10), +# dtype=torch.long, +# device=torch.device("cpu:0")) + +## distrillation model +model = DistilBertModel.from_pretrained("distilbert-base-uncased", + return_dict=True) + +## bert model +bert_model = BertModel.from_pretrained("bert-base-uncased", return_dict=True) + +cfg = model.config +print(cfg) +print(inputs) +outputs = model(**inputs) +bert_outputs = bert_model(**inputs) + +print(model) +print(bert_model) + +# print(bert_outputs - outputs) +# +# last_hidden_states = outputs.last_hidden_state +# print(last_hidden_states) diff --git a/requirements.txt b/requirements.txt index d4b250f5..f020e066 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ contexttimer onnx future -transformers==3.0.2 +transformers==3.4.0 diff --git a/tools/docker/Dockerfile_release.cpu b/tools/docker/Dockerfile_release.cpu index 4b96e547..302b6be9 100644 --- a/tools/docker/Dockerfile_release.cpu +++ b/tools/docker/Dockerfile_release.cpu @@ -14,5 +14,5 @@ RUN /opt/conda/bin/conda install pytorch==1.5.0 cpuonly -c pytorch && \ /opt/conda/bin/conda install make cmake git graphviz gperftools git-lfs docopt -c conda-forge && \ /opt/conda/bin/conda clean -afy -RUN pip --no-cache-dir install contexttimer future transformers==3.0.2 docopt onnxruntime-tools +RUN pip --no-cache-dir install contexttimer future transformers==3.4.0 docopt onnxruntime-tools WORKDIR /workspace diff --git a/tools/docker/Dockerfile_release.gpu b/tools/docker/Dockerfile_release.gpu index 82eb5a44..6ab7ea84 100644 --- a/tools/docker/Dockerfile_release.gpu +++ b/tools/docker/Dockerfile_release.gpu @@ -13,7 +13,9 @@ RUN curl -LO https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3- conda install pytorch=PYTORCH_VERSION cudatoolkit=CUDA_VERSION cudnn --freeze-installed -c pytorch && \ conda clean -yfa -RUN pip --no-cache-dir install contexttimer future transformers==3.0.2 docopt OpenNMT-py==1.1.0 onnxruntime-gpu==1.3.0 + +RUN pip --no-cache-dir install contexttimer future transformers==3.4.0 docopt OpenNMT-py==1.1.0 onnxruntime-gpu==1.3.0 + COPY --from=DEV_IMAGE /opt/miniconda3/lib/python3.7/site-packages/turbo_transformers /opt/miniconda3/lib/python3.7/site-packages/turbo_transformers diff --git a/turbo_transformers/layers/positionwise_ffn.cpp b/turbo_transformers/layers/positionwise_ffn.cpp index fbec26bf..09398ac4 100644 --- a/turbo_transformers/layers/positionwise_ffn.cpp +++ b/turbo_transformers/layers/positionwise_ffn.cpp @@ -76,5 +76,53 @@ void PositionwiseFeedForward::operator()(const core::Tensor& input_tensor, void PositionwiseFeedForward::EnforceShapeAndType() const {} +void DistrillFFN::operator()(const core::Tensor& input_tensor, + core::Tensor* output_tensor, + bool is_trans_weight) const { + auto d_ff = + is_trans_weight ? dense_weight_1_.shape(0) : dense_weight_1_.shape(1); + + auto model_dim_weight = + is_trans_weight ? dense_weight_1_.shape(1) : dense_weight_1_.shape(0); + auto model_dim = input_tensor.shape(2); + + TT_ENFORCE_EQ( + model_dim_weight, model_dim, + "dense weight and input tensor should have the same model_dim."); + + auto devType = input_tensor.device_type(); + auto devId = input_tensor.device_id(); + + // input tensor size (batch_size, input_len, model_dim) + auto batch_size = input_tensor.shape(0); + auto input_len = input_tensor.shape(1); + // allocate memory for temp data + core::Tensor input_tensor_copy(nullptr); + input_tensor_copy.Reshape({batch_size, input_len, model_dim}, devType, + devId); + core::Tensor temp_tensor(nullptr); + temp_tensor.Reshape({batch_size * input_len, d_ff}, devType, devId); + + // start computation + core::Copy(input_tensor, input_tensor_copy, "FFN/AddInputBias"); + + output_tensor->Reshape({batch_size, input_len, model_dim}, devType, + devId, "FFN/Reshape"); + kernels::MatMul(input_tensor_copy, false, dense_weight_1_, is_trans_weight, + 1.0, // input (b*seq, model) X dense_weight_1_ (model_dim, + // d_ff) -> temp_tensor (B*seq, d_ff) + &temp_tensor, 0.0, "FFN/gemm0"); + kernels::AddBiasAct( + dense_bias_1_, &temp_tensor, "FFN/AddBiasAct"); + kernels::MatMul(temp_tensor, false, dense_weight_2_, is_trans_weight, 1.0, + &input_tensor_copy, 0.0, "FFN/gemm1"); + kernels::AddInputBias(input_tensor, input_tensor_copy, dense_bias_2_, + output_tensor, "FFN/AddInputBias"); + kernels::LayerNorm(layer_norm_weight_, layer_norm_bias_, output_tensor, + 1e-12, "FFN/LayerNorm"); +} + +void DistrillFFN::EnforceShapeAndType() const {} + } // namespace layers } // namespace turbo_transformers diff --git a/turbo_transformers/layers/positionwise_ffn.h b/turbo_transformers/layers/positionwise_ffn.h index 9e091872..a3ab691d 100644 --- a/turbo_transformers/layers/positionwise_ffn.h +++ b/turbo_transformers/layers/positionwise_ffn.h @@ -14,6 +14,7 @@ #pragma once #include #include + #include "turbo_transformers/core/tensor.h" namespace turbo_transformers { @@ -51,5 +52,34 @@ class PositionwiseFeedForward { core::Tensor layer_norm_bias_; }; +class DistrillFFN { + public: + DistrillFFN(core::Tensor dense_weight_1, core::Tensor dense_bias_1, + core::Tensor dense_weight_2, core::Tensor dense_bias_2, + core::Tensor layer_norm_weight, core::Tensor layer_norm_bias) + : dense_weight_1_(std::move(dense_weight_1)), + dense_bias_1_(std::move(dense_bias_1)), + dense_weight_2_(std::move(dense_weight_2)), + dense_bias_2_(std::move(dense_bias_2)), + layer_norm_weight_(std::move(layer_norm_weight)), + layer_norm_bias_(std::move(layer_norm_bias)) { + EnforceShapeAndType(); + } + void EnforceShapeAndType() const; + + // according to profiling results on Intel 61xx, is_trans_weight = true is + // faster + void operator()(const core::Tensor &input_tensor, core::Tensor *output, + bool is_trans_weight = true) const; + + private: + core::Tensor dense_weight_1_; + core::Tensor dense_bias_1_; + core::Tensor dense_weight_2_; + core::Tensor dense_bias_2_; + core::Tensor layer_norm_weight_; + core::Tensor layer_norm_bias_; +}; + } // namespace layers } // namespace turbo_transformers diff --git a/turbo_transformers/python/pybind.cpp b/turbo_transformers/python/pybind.cpp index c6253d5d..74e17cbb 100644 --- a/turbo_transformers/python/pybind.cpp +++ b/turbo_transformers/python/pybind.cpp @@ -221,6 +221,18 @@ PYBIND11_MODULE(turbo_transformers_cxx, m) { })) .def("__call__", &layers::PositionwiseFeedForward::operator()); + py::class_(m, "DistrillFFN") + .def(py::init([](core::Tensor &dense_weight_1, core::Tensor &dense_bias_1, + core::Tensor &dense_weight_2, core::Tensor &dense_bias_2, + core::Tensor &layer_norm_weight, + core::Tensor &layer_norm_bias) -> layers::DistrillFFN * { + return new layers::DistrillFFN( + std::move(dense_weight_1), std::move(dense_bias_1), + std::move(dense_weight_2), std::move(dense_bias_2), + std::move(layer_norm_weight), std::move(layer_norm_bias)); + })) + .def("__call__", &layers::DistrillFFN::operator()); + py::class_(m, "FusedAddBiasGELU") .def(py::init([](core::Tensor &dense_bias) -> layers::FusedAddBiasGELU * { return new layers::FusedAddBiasGELU(std::move(dense_bias)); diff --git a/turbo_transformers/python/tests/distill_bert_attention_test.py b/turbo_transformers/python/tests/distill_bert_attention_test.py new file mode 100644 index 00000000..eb19c5ad --- /dev/null +++ b/turbo_transformers/python/tests/distill_bert_attention_test.py @@ -0,0 +1,117 @@ +# Copyright (C) 2020 THL A29 Limited, a Tencent company. +# All rights reserved. +# Licensed under the BSD 3-Clause License (the "License"); you may +# not use this file except in compliance with the License. You may +# obtain a copy of the License at +# https://opensource.org/licenses/BSD-3-Clause +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# See the AUTHORS file for names of contributors. + +import turbo_transformers + +import unittest +import sys +import torch +from transformers.modeling_distilbert import DistilBertConfig +from transformers.modeling_distilbert import MultiHeadSelfAttention as DistilAttention +from torch import nn + +import os +sys.path.append(os.path.dirname(__file__)) +import test_helper + +fname = "tt_distrill_attention.txt" + + +def create_test(batch_size, seq_length): + class TestDistillBertAttention(unittest.TestCase): + def init_data(self, use_cuda): + test_device = torch.device('cuda:0') if use_cuda else \ + torch.device('cpu:0') + if not use_cuda: + torch.set_num_threads(4) + turbo_transformers.set_num_threads(4) + + torch.set_grad_enabled(False) + self.cfg = DistilBertConfig(attention_probs_dropout_prob=0.0, + hidden_dropout_prob=0.0) + self.cfg.output_attentions = True + self.torch_attention = DistilAttention(self.cfg) + self.torch_sa_layer_norm = nn.LayerNorm( + normalized_shape=self.cfg.dim, eps=1e-12) + self.torch_attention.eval() + self.torch_sa_layer_norm.eval() + if use_cuda: + self.torch_attention.to(test_device) + self.torch_sa_layer_norm.to(test_device) + + # Get FT Attention + self.turbo_attention = turbo_transformers.DistillBertAttention.from_torch( + self.torch_attention, self.torch_sa_layer_norm) + + hidden_size = self.cfg.hidden_size + self.input_tensor = torch.rand(size=(batch_size, seq_length, + hidden_size), + dtype=torch.float32, + device=test_device) + # NOTE, the mask of distilled attention is different from huggingface bert attention. + self.attention_mask = torch.ones((batch_size, seq_length), + dtype=torch.float32, + device=test_device) + + def check_torch_and_turbo(self, use_cuda, num_iter=1): + self.init_data(use_cuda) + device = "GPU" if use_cuda else "CPU" + torch_model = lambda: self.torch_sa_layer_norm( + self.torch_attention(query=self.input_tensor, + key=self.input_tensor, + value=self.input_tensor, + mask=self.attention_mask, + output_attentions=False)[0] + self. + input_tensor) + torch_attention_result, torch_qps, torch_time_consume = \ + test_helper.run_model(torch_model, use_cuda, num_iter, use_profile=False) + print( + f"DistilAttention+LN \"({batch_size},{seq_length:03})\" ", + f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}") + + turbo_model = lambda: self.turbo_attention( + self.input_tensor, + self.attention_mask, + output_attentions=self.cfg.output_attentions)[0] + + turbo_attention_result, turbo_qps, turbo_time_consume = \ + test_helper.run_model(turbo_model, use_cuda, + num_iter) + print( + f"DistilAttention \"({batch_size},{seq_length:03})\" ", + f" {device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}" + ) + + self.assertTrue( + torch.max( + torch.abs(torch_attention_result - turbo_attention_result)) + < (1e-3 if use_cuda else 1e-4)) + + def test_distillbert_attention(self): + self.check_torch_and_turbo(use_cuda=False, num_iter=1) + if torch.cuda.is_available() and \ + turbo_transformers.config.is_compiled_with_cuda(): + self.check_torch_and_turbo(use_cuda=True, num_iter=1) + + globals( + )[f"TestDistillBertAtt{batch_size}_{seq_length:3}"] = TestDistillBertAttention + + +with open(fname, "w") as fh: + fh.write(", torch, turbo_transformers\n") +for batch_size in [1, 2]: + for seq_length in [10, 20, 128]: + create_test(batch_size, seq_length) + +if __name__ == '__main__': + unittest.main() diff --git a/turbo_transformers/python/tests/distill_bert_ffn_test.py b/turbo_transformers/python/tests/distill_bert_ffn_test.py new file mode 100644 index 00000000..63af780a --- /dev/null +++ b/turbo_transformers/python/tests/distill_bert_ffn_test.py @@ -0,0 +1,110 @@ +# Copyright (C) 2020 THL A29 Limited, a Tencent company. +# All rights reserved. +# Licensed under the BSD 3-Clause License (the "License"); you may +# not use this file except in compliance with the License. You may +# obtain a copy of the License at +# https://opensource.org/licenses/BSD-3-Clause +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# See the AUTHORS file for names of contributors. + +import turbo_transformers + +import unittest +import sys +import torch +import os + +from transformers.modeling_distilbert import DistilBertConfig +from transformers.modeling_distilbert import FFN as DistilFFN + +sys.path.append(os.path.dirname(__file__)) +import test_helper + +fname = "distrill_ffn.txt" + + +def create_test(batch_size, input_len): + class TestDistillFFN(unittest.TestCase): + def init_data(self, use_cuda): + self.test_device = torch.device('cuda:0') if use_cuda else \ + torch.device('cpu:0') + if not use_cuda: + torch.set_num_threads(4) + turbo_transformers.set_num_threads(4) + + self.cfg = DistilBertConfig(attention_probs_dropout_prob=0.0, + hidden_dropout_prob=0.0) + + torch.set_grad_enabled(False) + self.torch_ffn = DistilFFN(self.cfg) + self.torch_ffn.eval() + self.output_layer_norm = torch.nn.LayerNorm( + normalized_shape=self.cfg.dim, eps=1e-12) + if use_cuda: + self.torch_ffn.to(self.test_device) + self.output_layer_norm.to(self.test_device) + + self.turbo_ffn = turbo_transformers.DistrillFFN.from_torch( + self.torch_ffn, self.output_layer_norm) + # (batch_size, input_len, model_dim) + self.inputs = torch.rand(size=(batch_size, input_len, + self.cfg.dim), + dtype=torch.float32, + device=self.test_device) + + print(self.cfg.activation) + + def check_torch_and_turbo(self, use_cuda, num_iter=1): + self.init_data(use_cuda) + device = "GPU" if use_cuda else "CPU" + + torch_model = lambda: self.output_layer_norm( + self.torch_ffn(self.inputs) + self.inputs) + torch_res, torch_qps, torch_time_consume = \ + test_helper.run_model(torch_model, use_cuda, num_iter) + + print( + f"DistrillFFN \"({batch_size}, {input_len:03})\" ", + f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}") + + turbo_res = lambda: self.turbo_ffn(self.inputs, + is_trans_weight=True) + with turbo_transformers.pref_guard("gpref_test") as perf: + turbo_res, turbo_qps, turbo_time_consume = \ + test_helper.run_model(turbo_res, use_cuda, num_iter) + + print( + f"DistrillFFN \"({batch_size}, {input_len:03})\" ", + f"{device} Turbo Trans QPS, {turbo_qps}, time, {turbo_time_consume}" + ) + + print(torch.max(torch.abs(torch_res - turbo_res))) + self.assertTrue(torch.max(torch.abs(torch_res - turbo_res)) < 1e-3) + + with open(fname, "a") as fh: + fh.write( + f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps}\n" + ) + + def test_distrill_ffn(self): + self.check_torch_and_turbo(use_cuda=False) + if torch.cuda.is_available() and \ + turbo_transformers.config.is_compiled_with_cuda(): + self.check_torch_and_turbo(use_cuda=True) + + globals()[f"TestDistillFFN{batch_size}_{input_len:3}"] = TestDistillFFN + + +with open(fname, "w") as fh: + fh.write(", torch, turbo_trans\n") + +for batch_size in [1, 4]: + for input_len in [10, 20, 30, 40, 50]: + create_test(batch_size, input_len) + +if __name__ == '__main__': + unittest.main() diff --git a/turbo_transformers/python/tests/distill_bert_model_test.py b/turbo_transformers/python/tests/distill_bert_model_test.py new file mode 100644 index 00000000..2993bb92 --- /dev/null +++ b/turbo_transformers/python/tests/distill_bert_model_test.py @@ -0,0 +1,111 @@ +# Copyright (C) 2020 THL A29 Limited, a Tencent company. +# All rights reserved. +# Licensed under the BSD 3-Clause License (the "License"); you may +# not use this file except in compliance with the License. You may +# obtain a copy of the License at +# https://opensource.org/licenses/BSD-3-Clause +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# See the AUTHORS file for names of contributors. + +import turbo_transformers + +import unittest +import sys +import torch +import os + +from transformers.modeling_distilbert import DistilBertConfig +from transformers.modeling_distilbert import DistilBertModel + +sys.path.append(os.path.dirname(__file__)) +import test_helper + +fname = "distrill_tbert.txt" + + +def create_test(batch_size, input_len): + class TestDistillBertModel(unittest.TestCase): + def init_data(self, use_cuda): + self.test_device = torch.device('cuda:0') if use_cuda else \ + torch.device('cpu:0') + if not use_cuda: + torch.set_num_threads(4) + turbo_transformers.set_num_threads(4) + + self.cfg = DistilBertConfig(attention_probs_dropout_prob=0.0, + hidden_dropout_prob=0.0) + + torch.set_grad_enabled(False) + self.torch_model = DistilBertModel(self.cfg) + self.torch_model.eval() + if use_cuda: + self.torch_model.to(self.test_device) + + self.turbo_transformer = turbo_transformers.DistilBertModel.from_torch( + self.torch_model) + # (batch_size, input_len, model_dim) + self.inputs = torch.randint(low=0, + high=self.cfg.vocab_size - 1, + size=(batch_size, input_len), + dtype=torch.long, + device=self.test_device) + self.attention_mask = torch.ones((batch_size, input_len), + dtype=torch.long, + device=self.test_device) + self.head_mask = [None] * self.cfg.num_hidden_layers + + def check_torch_and_turbo(self, use_cuda, num_iter=1): + self.init_data(use_cuda) + device = "GPU" if use_cuda else "CPU" + + torch_model = lambda: self.torch_model(self.inputs, self. + attention_mask) + torch_res, torch_qps, torch_time_consume = \ + test_helper.run_model(torch_model, use_cuda, num_iter) + + print( + f"DistillBertModel \"({batch_size}, {input_len:03})\" ", + f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}") + + turbo_res = lambda: self.turbo_transformer( + self.inputs, self.attention_mask, head_mask=self.head_mask) + with turbo_transformers.pref_guard("gpref_test") as perf: + turbo_res, turbo_qps, turbo_time_consume = \ + test_helper.run_model(turbo_res, use_cuda, num_iter) + + print( + f"DistillBertModel \"({batch_size}, {input_len:03})\" ", + f"{device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}") + + self.assertTrue( + torch.max(torch.abs(torch_res[0] - turbo_res[0])) < 1e-2 + if use_cuda else 1e-3) + + with open(fname, "a") as fh: + fh.write( + f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps}\n" + ) + + def test_distrill_bert_model(self): + self.check_torch_and_turbo(use_cuda=False) + if torch.cuda.is_available() and \ + turbo_transformers.config.is_compiled_with_cuda(): + self.check_torch_and_turbo(use_cuda=True) + + globals( + )[f"TestDistillTBertModel{batch_size}_{input_len:3}"] = TestDistillBertModel + + +with open(fname, "w") as fh: + fh.write(", torch, turbo_trans\n") + +for batch_size in [4]: + for input_len in [10]: + create_test(batch_size, input_len) + +if __name__ == '__main__': + unittest.main() diff --git a/turbo_transformers/python/tests/distill_transformer_block_test.py b/turbo_transformers/python/tests/distill_transformer_block_test.py new file mode 100644 index 00000000..81cf3968 --- /dev/null +++ b/turbo_transformers/python/tests/distill_transformer_block_test.py @@ -0,0 +1,109 @@ +# Copyright (C) 2020 THL A29 Limited, a Tencent company. +# All rights reserved. +# Licensed under the BSD 3-Clause License (the "License"); you may +# not use this file except in compliance with the License. You may +# obtain a copy of the License at +# https://opensource.org/licenses/BSD-3-Clause +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# See the AUTHORS file for names of contributors. + +import turbo_transformers + +import unittest +import sys +import torch +import os + +from transformers.modeling_distilbert import DistilBertConfig +from transformers.modeling_distilbert import TransformerBlock as DistilTransformerBlock + +sys.path.append(os.path.dirname(__file__)) +import test_helper + +fname = "distrill_transformer_block.txt" + + +def create_test(batch_size, input_len): + class TestDistillTransformerBlock(unittest.TestCase): + def init_data(self, use_cuda): + self.test_device = torch.device('cuda:0') if use_cuda else \ + torch.device('cpu:0') + if not use_cuda: + torch.set_num_threads(4) + turbo_transformers.set_num_threads(4) + + self.cfg = DistilBertConfig(attention_probs_dropout_prob=0.0, + hidden_dropout_prob=0.0) + + torch.set_grad_enabled(False) + self.torch_transformer_block = DistilTransformerBlock(self.cfg) + self.torch_transformer_block.eval() + if use_cuda: + self.torch_transformer_block.to(self.test_device) + + self.turbo_transformer_block = turbo_transformers.DistrillTransformerBlock.from_torch( + self.torch_transformer_block) + # (batch_size, input_len, model_dim) + self.attention_mask = torch.ones((batch_size, input_len), + dtype=torch.float32, + device=self.test_device) + + self.inputs = torch.rand(size=(batch_size, input_len, + self.cfg.dim), + dtype=torch.float32, + device=self.test_device) + + def check_torch_and_turbo(self, use_cuda, num_iter=1): + self.init_data(use_cuda) + device = "GPU" if use_cuda else "CPU" + + torch_model = lambda: self.torch_transformer_block( + self.inputs, self.attention_mask) + torch_res, torch_qps, torch_time_consume = \ + test_helper.run_model(torch_model, use_cuda, num_iter) + + print( + f"DistrillTransformerBlock \"({batch_size}, {input_len:03})\" ", + f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}") + + turbo_res = lambda: self.turbo_transformer_block( + self.inputs, self.attention_mask) + with turbo_transformers.pref_guard("gpref_test") as perf: + turbo_res, turbo_qps, turbo_time_consume = \ + test_helper.run_model(turbo_res, use_cuda, num_iter) + + print( + f"DistrillTransformerBlock \"({batch_size}, {input_len:03})\" ", + f"{device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}") + + self.assertTrue( + torch.max(torch.abs(torch_res[0] - turbo_res[0])) < 1e-3) + + with open(fname, "a") as fh: + fh.write( + f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps}\n" + ) + + def test_distrill_transformer_block(self): + self.check_torch_and_turbo(use_cuda=False) + if torch.cuda.is_available() and \ + turbo_transformers.config.is_compiled_with_cuda(): + self.check_torch_and_turbo(use_cuda=True) + + globals( + )[f"TestDistillTransformerBlock{batch_size}_{input_len:3}"] = TestDistillTransformerBlock + + +with open(fname, "w") as fh: + fh.write(", torch, turbo_trans\n") + +for batch_size in [1, 4]: + for input_len in [10, 20, 30, 40, 50]: + create_test(batch_size, input_len) + +if __name__ == '__main__': + unittest.main() diff --git a/turbo_transformers/python/tests/distill_transformer_test.py b/turbo_transformers/python/tests/distill_transformer_test.py new file mode 100644 index 00000000..bb0919f4 --- /dev/null +++ b/turbo_transformers/python/tests/distill_transformer_test.py @@ -0,0 +1,112 @@ +# Copyright (C) 2020 THL A29 Limited, a Tencent company. +# All rights reserved. +# Licensed under the BSD 3-Clause License (the "License"); you may +# not use this file except in compliance with the License. You may +# obtain a copy of the License at +# https://opensource.org/licenses/BSD-3-Clause +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# See the AUTHORS file for names of contributors. + +import turbo_transformers + +import unittest +import sys +import torch +import os + +from transformers.modeling_distilbert import DistilBertConfig +from transformers.modeling_distilbert import Transformer as DistilTransformer + +sys.path.append(os.path.dirname(__file__)) +import test_helper + +fname = "distrill_transformer.txt" + + +def create_test(batch_size, input_len): + class TestDistillTransformer(unittest.TestCase): + def init_data(self, use_cuda): + self.test_device = torch.device('cuda:0') if use_cuda else \ + torch.device('cpu:0') + if not use_cuda: + torch.set_num_threads(4) + turbo_transformers.set_num_threads(4) + + self.cfg = DistilBertConfig(attention_probs_dropout_prob=0.0, + hidden_dropout_prob=0.0) + + torch.set_grad_enabled(False) + self.torch_transformer = DistilTransformer(self.cfg) + self.torch_transformer.eval() + if use_cuda: + self.torch_transformer.to(self.test_device) + + self.turbo_transformer = turbo_transformers.DistrillTransformer.from_torch( + self.torch_transformer) + # (batch_size, input_len, model_dim) + self.attention_mask = torch.ones((batch_size, input_len), + dtype=torch.float32, + device=self.test_device) + + self.inputs = torch.rand(size=(batch_size, input_len, + self.cfg.dim), + dtype=torch.float32, + device=self.test_device) + self.head_mask = [None] * self.cfg.num_hidden_layers + + def check_torch_and_turbo(self, use_cuda, num_iter=1): + self.init_data(use_cuda) + device = "GPU" if use_cuda else "CPU" + + torch_model = lambda: self.torch_transformer( + self.inputs, self.attention_mask, head_mask=self.head_mask) + torch_res, torch_qps, torch_time_consume = \ + test_helper.run_model(torch_model, use_cuda, num_iter) + + print( + f"DistillBertTransformer \"({batch_size}, {input_len:03})\" ", + f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}") + + turbo_res = lambda: self.turbo_transformer( + self.inputs, self.attention_mask, head_mask=self.head_mask) + with turbo_transformers.pref_guard("gpref_test") as perf: + turbo_res, turbo_qps, turbo_time_consume = \ + test_helper.run_model(turbo_res, use_cuda, num_iter) + + print( + f"DistillBertTransformer \"({batch_size}, {input_len:03})\" ", + f"{device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}") + + print(torch.max(torch.abs(torch_res[0] - turbo_res[0]))) + self.assertTrue( + torch.max(torch.abs(torch_res[0] - turbo_res[0])) < 1e-2 + if use_cuda else 1e-3) + + with open(fname, "a") as fh: + fh.write( + f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps}\n" + ) + + def test_distrill_transformer(self): + self.check_torch_and_turbo(use_cuda=False) + if torch.cuda.is_available() and \ + turbo_transformers.config.is_compiled_with_cuda(): + self.check_torch_and_turbo(use_cuda=True) + + globals( + )[f"TestDistillTransformer{batch_size}_{input_len:3}"] = TestDistillTransformer + + +with open(fname, "w") as fh: + fh.write(", torch, turbo_trans\n") + +for batch_size in [1, 4]: + for input_len in [10, 20, 30, 40, 50]: + create_test(batch_size, input_len) + +if __name__ == '__main__': + unittest.main() diff --git a/turbo_transformers/python/turbo_transformers/layers/__init__.py b/turbo_transformers/python/turbo_transformers/layers/__init__.py index d539aa53..a3807cde 100644 --- a/turbo_transformers/python/turbo_transformers/layers/__init__.py +++ b/turbo_transformers/python/turbo_transformers/layers/__init__.py @@ -19,6 +19,7 @@ from .modeling_decoder import MultiHeadedAttention, PositionwiseFeedForward, TransformerDecoderLayer, TransformerDecoder from .modeling_roberta import RobertaModel from .modeling_gpt2 import GPT2Model +from .modeling_distillbert import DistillBertAttention, DistrillFFN, DistrillTransformerBlock, DistrillTransformer, DistilBertModel from .return_type import ReturnType @@ -33,5 +34,7 @@ 'AlbertAttention', 'AlbertTransformer', 'AlbertModel', 'PositionwiseFeedForward', 'TransformerDecoderLayer', 'TransformerDecoder', 'RobertaModel', 'QBertIntermediate', 'QBertOutput', 'QBertLayer', - 'QBertEncoder', 'QBertModel', 'GPT2Model' + 'QBertEncoder', 'QBertModel', 'GPT2Model', 'DistillBertAttention', + 'DistrillFFN', 'DistrillTransformerBlock', 'DistrillTransformer', + 'DistilBertModel' ] diff --git a/turbo_transformers/python/turbo_transformers/layers/modeling_distillbert.py b/turbo_transformers/python/turbo_transformers/layers/modeling_distillbert.py new file mode 100644 index 00000000..2a9fa8c9 --- /dev/null +++ b/turbo_transformers/python/turbo_transformers/layers/modeling_distillbert.py @@ -0,0 +1,314 @@ +# Copyright (C) 2020 THL A29 Limited, a Tencent company. +# All rights reserved. +# Licensed under the BSD 3-Clause License (the "License"); you may +# not use this file except in compliance with the License. You may +# obtain a copy of the License at +# https://opensource.org/licenses/BSD-3-Clause +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# See the AUTHORS file for names of contributors. + +try: + # `turbo_transformers_cxxd` is the name on debug mode + import turbo_transformers.turbo_transformers_cxxd as cxx +except ImportError: + import turbo_transformers.turbo_transformers_cxx as cxx +from typing import Union, Optional, Sequence +import torch +from .return_type import convert_returns_as_type, ReturnType +from .utils import try_convert, convert2tt_tensor, to_param_dict_convert_tt, to_param_dict, create_empty_if_none, AnyTensor + +from transformers.modeling_distilbert import DistilBertConfig +from transformers.modeling_distilbert import MultiHeadSelfAttention as TorchDistilMultiHeadSelfAttention +from transformers.modeling_distilbert import FFN as TorchDistilFFN +from transformers.modeling_distilbert import TransformerBlock as TorchDistilTransformerBlock +from transformers.modeling_distilbert import Transformer as TorchDistilTransformer +from transformers.modeling_distilbert import Embeddings as TorchDistrilEmbeddings +from transformers.modeling_distilbert import DistilBertModel as TorchDistilBertModel + +from torch import nn +import numpy as np +__all__ = [ + 'DistillBertAttention', 'DistrillFFN', 'DistrillTransformerBlock', + 'DistrillTransformer', 'DistilBertModel' +] + + +class DistillBertAttention(cxx.BertAttention): + def __call__(self, + input_tensor: AnyTensor, + attention_mask: Optional[AnyTensor] = None, + head_mask: Optional[AnyTensor] = None, + output_attentions: Optional[bool] = False, + return_type: Optional[ReturnType] = None, + is_trans_weight: Optional[cxx.Tensor] = False): + assert (head_mask is None) + # attention mask is different from BERT + if attention_mask is not None: + attention_mask = attention_mask[:, None, None, :] + attention_mask = ( + 1.0 - attention_mask) * -10000.0 #-float("inf") will cause NAN + + input_tensor = try_convert(input_tensor) + attention_mask = try_convert(create_empty_if_none(attention_mask)) + context_layer = cxx.Tensor.create_empty() + attn_probs = cxx.Tensor.create_empty() + super(DistillBertAttention, + self).__call__(input_tensor, attention_mask, context_layer, + attn_probs, is_trans_weight) + outputs = (convert_returns_as_type(context_layer, return_type), + convert_returns_as_type(attn_probs, ReturnType.TORCH) + ) if output_attentions else (convert_returns_as_type( + context_layer, return_type), ) + return outputs + + @staticmethod + def from_torch(attention: TorchDistilMultiHeadSelfAttention, + layernorm: nn.LayerNorm): + params = {k: v for k, v in attention.named_parameters()} + layernorm_params = {k: v for k, v in layernorm.named_parameters()} + + with torch.no_grad(): + # merge self.query.weight, self.query.weight and self.query.weight together as qkv.weight + qkv_weight = torch.clone( + torch.t( + torch.cat((params['q_lin.weight'], params['k_lin.weight'], + params['v_lin.weight']), + 0).contiguous()).contiguous()) + qkv_bias = torch.cat((params['q_lin.bias'], params['k_lin.bias'], + params['v_lin.bias']), 0).contiguous() + + output_weight = torch.clone( + torch.t(params['out_lin.weight']).contiguous()) + att = DistillBertAttention( + convert2tt_tensor(qkv_weight), convert2tt_tensor(qkv_bias), + convert2tt_tensor(output_weight), + convert2tt_tensor(params['out_lin.bias']), + convert2tt_tensor(layernorm_params['weight']), + convert2tt_tensor(layernorm_params['bias']), attention.n_heads) + + return att + + +class DistrillFFN(cxx.DistrillFFN): + def __call__( + self, + input_tensor: AnyTensor, + return_type: Optional[ReturnType] = None, + is_trans_weight: Optional[bool] = True, #Intel 61xx True is faster + output: Optional[cxx.Tensor] = None): + input_tensor = try_convert(input_tensor) + output = create_empty_if_none(output) + super(DistrillFFN, self).__call__(input_tensor, output, + is_trans_weight) + return convert_returns_as_type(output, return_type) + + @staticmethod + def from_torch(ffn: TorchDistilFFN, + layernorm: nn.LayerNorm, + is_trans_weight: Optional[bool] = True): + ffn_params = {k: v for k, v in ffn.named_parameters()} + layernorm_params = {k: v for k, v in layernorm.named_parameters()} + + # Note that torch's weights of linear layer is transposed + if is_trans_weight: + w_1 = convert2tt_tensor(ffn_params['lin1.weight']) + w_2 = convert2tt_tensor(ffn_params['lin2.weight']) + else: + w_1 = convert2tt_tensor( + torch.clone(torch.t(ffn_params['lin1.weight']).contiguous())) + w_2 = convert2tt_tensor( + torch.clone(torch.t(ffn_params['lin2.weight']).contiguous())) + + with torch.no_grad(): + ffn = DistrillFFN(w_1, convert2tt_tensor(ffn_params['lin1.bias']), + w_2, convert2tt_tensor(ffn_params['lin2.bias']), + convert2tt_tensor(layernorm_params['weight']), + convert2tt_tensor(layernorm_params['bias'])) + return ffn + + +class DistrillTransformerBlock: + def __init__(self, attn: DistillBertAttention, ffn: DistrillFFN): + self.attention = attn + self.ffn = ffn + + def __call__(self, + hidden_states: AnyTensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions=False, + return_type: Optional[ReturnType] = None): + hidden_states = try_convert(hidden_states) + + sa_output = self.attention(hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + return_type=ReturnType.turbo_transformers) + if output_attentions: + sa_output, sa_weights = sa_output + else: + sa_output = sa_output[0] + ffn_output = self.ffn(sa_output) + output = (ffn_output, ) + if output_attentions: + output = (sa_weights, ) + output + return output + + @staticmethod + def from_torch(layer: TorchDistilTransformerBlock): + return DistrillTransformerBlock( + DistillBertAttention.from_torch(layer.attention, + layer.sa_layer_norm), + DistrillFFN.from_torch(layer.ffn, layer.output_layer_norm)) + + +class DistrillTransformer: + def __init__(self, blocks: Sequence[DistrillTransformerBlock]): + self.blocks = blocks + + def __call__(self, + hidden_states: AnyTensor, + attention_mask: Optional[AnyTensor] = None, + head_mask: Optional[AnyTensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_type: Optional[ReturnType] = ReturnType.TORCH): + all_hidden_states = () + all_attentions = () + hidden_states = try_convert(hidden_states) + for l in self.blocks: + layer_outputs = l(hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + return_type=ReturnType.turbo_transformers) + if output_hidden_states: + all_hidden_states = all_hidden_states + ( + convert_returns_as_type(hidden_states, ReturnType.TORCH), ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1], ) + + # outputs = (convert_returns_as_type(hidden_states, return_type), ) + outputs = (hidden_states, ) + # Add last layer + if output_hidden_states: + # TODO(jiaruifang)two return value use the same memory space, that is not supported in dlpack. + # So we do not append the last hidden_state at the buttom of all_hidden_states, + # User should use outputs[0] if necessary + # all_hidden_states = all_hidden_states + (convert_returns_as_type(hidden_states, ReturnType.TORCH),) + pass + + if output_hidden_states: + outputs = outputs + (all_hidden_states, ) + if output_attentions: + outputs = outputs + (all_attentions, ) + + return outputs + + @staticmethod + def from_torch(transform: TorchDistilTransformer): + blocks = [ + DistrillTransformerBlock.from_torch(l) for l in transform.layer + ] + return DistrillTransformer(blocks) + + +class DistilBertModel: + def __init__(self, + embeddings_onnxmodel_variant, + transformer: DistrillTransformer, + backend="turbo"): + if backend == "turbo": + self.embeddings = embeddings_onnxmodel_variant + self.transformer = transformer + self.backend = "turbo" + elif backend == "onnxrt": + self.onnxmodel = embeddings_onnxmodel_variant + self.backend = "onnxrt" + + def __call__(self, + input_ids: AnyTensor, + attention_masks: Optional[AnyTensor] = None, + token_type_ids: Optional[AnyTensor] = None, + position_ids: Optional[AnyTensor] = None, + head_mask: Optional[AnyTensor] = None, + inputs_embeds: Optional[AnyTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_type: Optional[ReturnType] = None): + if self.backend == "onnxrt": + if attention_masks is None: + attention_masks = np.ones(input_ids.size(), dtype=np.int64) + else: + attention_masks = attention_masks.cpu().numpy() + data = [input_ids.cpu().numpy(), attention_masks] + outputs = self.onnxmodel.run(inputs=data) + for idx, item in enumerate(outputs): + outputs[idx] = torch.tensor(item, device=input_ids.device) + return outputs + elif self.backend == "turbo": + # torch part + inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim) + inputs_embeds = try_convert(inputs_embeds) + + # turbo part + transformer_outputs = self.transformer( + hidden_states=inputs_embeds, + attention_mask=attention_masks, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_type=return_type) + return transformer_outputs + + @staticmethod + def from_torch(model: TorchDistilBertModel, backend="turbo"): + """ + :param model: a torch distrilBert Model + backend: turbo or onnxrt + move model to gpu before call this function. + """ + if backend == "turbo": + transformer = DistrillTransformer.from_torch(model.transformer) + return DistilBertModel(model.embeddings, transformer, "turbo") + elif backend == "onnxrt": + import onnx + import onnxruntime.backend + device = model.device + if 'cuda' in device.type and torch.cuda.is_available(): + use_gpu = True + else: + use_gpu = False + inputs = { + 'input_ids': + torch.randint(32, [2, 32], dtype=torch.long).to( + device), # list of numerical ids for the tokenised text + 'attention_mask': + torch.ones([2, 32], + dtype=torch.long).to(device), # dummy list of ones + } + onnx_model_path = "/tmp/temp_turbo_onnx.model" + with open(onnx_model_path, 'wb') as outf: + torch.onnx.export( + model=model, + args=(inputs['input_ids'], inputs['attention_mask'] + ), # model input (or a tuple for multiple inputs) + f=outf, + input_names=['input_ids', 'attention_mask'], + output_names=['output'], + dynamic_axes={ + 'input_ids': [0, 1], + 'attention_mask': [0, 1] + }) + onnx_model = onnx.load_model(f=onnx_model_path) + onnx_model = onnxruntime.backend.prepare( + model=onnx_model, + device='GPU' if use_gpu else "CPU", + graph_optimization_level=onnxruntime.GraphOptimizationLevel. + ORT_ENABLE_ALL) + return DistilBertModel(onnx_model, None, "onnxrt")