Jiaruifang/polish gpu docker build files (#201)
1. Polish the GPU docker build scripts.
2. Fix benchmark bugs.
3. Add the missing pip dependency (docopt) to the dev GPU docker file.
4. Fix a BERT model-aware allocator bug.
feifeibear authored Nov 25, 2020
1 parent 68b8f72 commit 055baa2
Showing 10 changed files with 32 additions and 30 deletions.
19 changes: 10 additions & 9 deletions benchmark/benchmark_helper.py
@@ -20,7 +20,8 @@ def run_model(model,
               batch_size,
               seq_len,
               framework_name,
-              num_threads=1):
+              num_threads=1,
+              enable_mem_opt=False):
     # warm up
     import torch
     import contexttimer
@@ -33,15 +34,15 @@ def run_model(model,
        start.record()

    with contexttimer.Timer() as t:
+        if enable_mem_opt:
+            turbo_transformers.bert_opt_mem_allocate_api(
+                batch_size,  # batch
+                seq_len,  # seq_len
+                model.config.num_attention_heads,
+                model.config.hidden_size,
+                model.config.num_hidden_layers,
+                "GPU" if use_gpu else "CPU")
        for it in range(num_iter):
-            if use_mem_opt:
-                turbo_transformers.bert_opt_mem_allocate_api(
-                    batch_size,  # batch
-                    seq_len,  # seq_len
-                    model.config.num_attention_heads,
-                    model.config.hidden_size,
-                    model.config.num_hidden_layers,
-                    "GPU" if use_gpu else "CPU")
            model()

    if not use_gpu:
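The benchmark fix is twofold. First, the old loop body tested use_mem_opt, a name that run_model never received as a parameter; the flag is now threaded through the signature as enable_mem_opt (and through benchmark_torch below). Second, bert_opt_mem_allocate_api ran on every timed iteration; it now runs once, before the loop, so the timer covers only the forward passes. A minimal sketch of the fixed timing structure, with a stub planner and model standing in for the real TurboTransformers calls (the sketch's names are illustrative, not from the commit):

import contexttimer  # the same timing package the benchmark imports

def run_model_sketch(model, num_iter, enable_mem_opt=False, plan_memory=None):
    # `model` is a no-argument callable (the benchmark wraps the forward pass
    # in a lambda); `plan_memory` stands in for bert_opt_mem_allocate_api.
    with contexttimer.Timer() as t:
        if enable_mem_opt and plan_memory is not None:
            plan_memory()  # plan activation memory once, not per iteration
        for _ in range(num_iter):
            model()  # only the forward passes are repeated
    return num_iter / t.elapsed  # throughput in iterations per second

With the old placement, the planner cost was paid num_iter times inside the timer, inflating the reported latency whenever the flag was on.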
3 changes: 2 additions & 1 deletion benchmark/torch_benchmark_helper.py
@@ -55,4 +55,5 @@ def benchmark_torch(model_name: str, seq_len: int, batch_size: int, n: int,
                               dtype=torch.long,
                               device=test_device)
     benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
-                               batch_size, seq_len, "torch", num_threads)
+                               batch_size, seq_len, "torch", num_threads,
+                               enable_mem_opt)
8 changes: 5 additions & 3 deletions tools/docker/Dockerfile_dev.gpu
@@ -1,18 +1,20 @@
 FROM IMAGE_BASE

-RUN sed -i s@/archive.ubuntu.com/@/mirrors.tuna.tsinghua.edu.cn/@g /etc/apt/sources.list && apt-get update && \
+# RUN sed -i s@/archive.ubuntu.com/@/mirrors.tuna.tsinghua.edu.cn/@g /etc/apt/sources.list && apt-get update && \
+RUN apt-get update && \
    apt-get install -y curl git ninja-build && rm -rf /var/lib/apt/lists/*

 ENV PATH=/opt/miniconda3/bin:${PATH} CONDA_PREFIX=/opt/miniconda3

-RUN curl -LO https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh && \
+# RUN curl -LO https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh && \
+RUN curl -LO https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh && \
    bash Miniconda3-py37_4.8.3-Linux-x86_64.sh -p /opt/miniconda3 -b && \
    rm Miniconda3-py37_4.8.3-Linux-x86_64.sh && \
    conda install pytorch=PYTORCH_VERSION cudatoolkit=CUDA_VERSION cudnn -c pytorch -y && \
    conda install conda-verify conda-build mkl-include cmake ninja -c anaconda -y && \
    conda clean -afy

-RUN pip install --no-cache-dir OpenNMT-py==1.1.0 onnxruntime-gpu==1.3.0
+RUN pip install --no-cache-dir OpenNMT-py==1.1.0 docopt onnxruntime-gpu==1.3.0

 # build turbo
 RUN mkdir -p /src && cd /src && git clone https://github.com/Tencent/TurboTransformers.git --recursive && cd ./TurboTransformers && \
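Neither GPU Dockerfile builds as-is: IMAGE_BASE, PYTORCH_VERSION, and CUDA_VERSION are placeholder tokens the repo's docker build tooling substitutes before invoking docker build (the release file's DEV_IMAGE token works the same way). The polish here makes the official Ubuntu and Anaconda servers the default while keeping the Tsinghua (tuna) mirror lines as comments for users who want to swap them back in, and adds docopt, which the benchmark scripts baked into the dev image depend on for command-line parsing. A minimal sketch of the placeholder substitution step (the output file name and version values are illustrative assumptions, not taken from the repo's scripts):

from pathlib import Path

# Illustrative values; the real ones come from the build tooling's arguments.
substitutions = {
    "IMAGE_BASE": "nvidia/cuda:10.1-devel-ubuntu18.04",
    "PYTORCH_VERSION": "1.5.0",
    "CUDA_VERSION": "10.1",
}

dockerfile = Path("tools/docker/Dockerfile_dev.gpu").read_text()
for token, value in substitutions.items():
    dockerfile = dockerfile.replace(token, value)
Path("Dockerfile.gpu.rendered").write_text(dockerfile)
# then: docker build -f Dockerfile.gpu.rendered -t turbo_gpu_dev .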
6 changes: 4 additions & 2 deletions tools/docker/Dockerfile_release.gpu
@@ -2,12 +2,14 @@ FROM DEV_IMAGE

 FROM IMAGE_BASE

-RUN sed -i s@/archive.ubuntu.com/@/mirrors.tuna.tsinghua.edu.cn/@g /etc/apt/sources.list && apt-get update && \
+# RUN sed -i s@/archive.ubuntu.com/@/mirrors.tuna.tsinghua.edu.cn/@g /etc/apt/sources.list && apt-get update && \
+RUN apt-get update && \
    apt-get install -y curl && rm -rf /var/lib/apt/lists/*

 ENV PATH=/opt/miniconda3/bin:${PATH} CONDA_PREFIX=/opt/miniconda3

-RUN curl -LO https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh && \
+# RUN curl -LO https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh && \
+RUN curl -LO https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh && \
    bash Miniconda3-py37_4.8.3-Linux-x86_64.sh -p /opt/miniconda3 -b && \
    rm Miniconda3-py37_4.8.3-Linux-x86_64.sh && \
    conda install pytorch=PYTORCH_VERSION cudatoolkit=CUDA_VERSION cudnn --freeze-installed -c pytorch && \
7 changes: 2 additions & 5 deletions turbo_transformers/core/allocator/bert_allocator_test.cpp
@@ -13,7 +13,6 @@

 #include <iostream>

-
 #include "catch2/catch.hpp"
 #include "turbo_transformers/core/allocator/bert_config.h"
 #include "turbo_transformers/core/allocator/model_aware_memory_scheduler.h"
@@ -83,16 +82,15 @@ TEST_CASE("bert-allocator-multiple-chunk",
   REQUIRE(CheckValid(tensor_position_map, bert_tensor_usage_record));
 }

-
 TEST_CASE("bert-allocator-multiple-allocation",
           "check multi times memory allocation correction") {
   std::vector<TensorRecordItemPtr> bert_tensor_usage_record;
   std::map<std::string, TensorPositionInfo> tensor_position_map;
   ChunkList chunk_list([](size_t size) -> char* { return new char[size]; },
                        [](void* mem_addr) { free(mem_addr); });

-  std::vector<int64_t> batch_list{1, 1, 2, 4, 1};
-  std::vector<int64_t> seq_len_list{10, 100, 32, 500, 10};
+  std::vector<int64_t> batch_list{2, 1, 2};
+  std::vector<int64_t> seq_len_list{50, 100, 50};
   std::set<std::string> activation_set;
   for (size_t i = 0; i < batch_list.size(); ++i) {
     LOG_S(INFO) << "begin allocate for batch " << batch_list[i] << " seq_len "
@@ -106,7 +104,6 @@ TEST_CASE("bert-allocator-multiple-allocation",

     chunk_list.ShowChunkUsage();
     REQUIRE(CheckValid(tensor_position_map, bert_tensor_usage_record));
-
   }
 }
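The multiple-allocation test now replays a shorter shape schedule — (batch 2, seq_len 50), then (1, 100), then (2, 50) again — which still exercises a grow-then-reuse pattern (plan for larger activations, then return to an earlier shape) with far fewer scheduler runs than the old five-shape sweep.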
2 changes: 0 additions & 2 deletions turbo_transformers/core/allocator/bert_config.cpp
@@ -61,8 +61,6 @@ void GetBertTensorUsageRecord(
   auto attn_score_size =
       batch_size * num_head * from_seq_len * to_seq_len * item_bytes;
   auto aligned_id_seq_size = from_seq_len * batch_size * id_bytes;
-  // auto aligned_id_seq_size =
-  //     (from_seq_len * batch_size + 31) * id_bytes / 32 * 32;

   auto extendedattnmask_size = batch_size * from_seq_len * item_bytes;
   ADDITEM("PrepareBertMasks/possitionids/Reshape", 0, 1, aligned_id_seq_size);
@@ -39,7 +39,6 @@ static bool TryFitChunk(
   int64_t smallest_gap = std::numeric_limits<int64_t>::max();
   bool success = false;
   chunk.visit([&](Chunk::ChunkNode* x) {
-    if (success) return;
    auto x_size = x->tensor_record_->size_;
    auto x_offset = x->offset_;

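This appears to be the model-aware allocator fix named in the commit message. Inside chunk.visit, `return` only skips the current node, so the removed guard turned every node after the first successful fit into a no-op: the routine behaved as first-fit even though it tracks smallest_gap to pick the tightest gap. With the guard gone, the scan inspects every candidate gap and the best one wins. A Python sketch of the intended best-fit scan over a chunk's offset-ordered tensors (simplified: lifetime-overlap checks, which the real TryFitChunk also performs, are omitted; the function and variable names are illustrative):

import sys

def try_fit_chunk(tensors, new_size):
    # `tensors` is a list of (offset, size) pairs sorted by offset -- a
    # simplified stand-in for the C++ chunk's offset-ordered node list.
    smallest_gap = sys.maxsize
    best_offset = None
    prev_end = 0  # free space starts at the beginning of the chunk
    for offset, size in tensors:
        gap = offset - prev_end
        # An early exit here once best_offset is set would reproduce the
        # removed `if (success) return;` behavior: first fit, not best fit.
        if new_size <= gap < smallest_gap:
            smallest_gap = gap
            best_offset = prev_end
        prev_end = max(prev_end, offset + size)
    return best_offset  # None: the caller must extend or add a chunk

placed = try_fit_chunk([(30, 10), (50, 5)], new_size=8)
assert placed == 40  # best fit takes the 10-byte gap, not the 30-byte one

A first-fit scan would have returned offset 0 here, wasting the large gap on a small tensor; the full scan packs the chunk tighter.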
10 changes: 6 additions & 4 deletions turbo_transformers/core/allocator/model_aware_memory_scheduler.h
@@ -52,11 +52,13 @@ class Chunk {
     const TensorRecordItemPtr tensor_record_;
     int64_t offset_;
     bool operator<(const ChunkNode& o) const { return offset_ < o.offset_; }
-    bool operator<=(const ChunkNode& o) const { return offset_ <= o.offset_; }
+    bool operator>(const ChunkNode& o) const { return offset_ > o.offset_; }
+    bool operator>=(const ChunkNode& o) const { return offset_ >= o.offset_; }
+    bool operator<=(const ChunkNode& o) const { return offset_ <= o.offset_; }
   };

   bool operator<(const Chunk& o) const { return size_ < o.size_; }
   bool operator>(const Chunk& o) const { return size_ > o.size_; }
   bool operator>=(const Chunk& o) const { return size_ >= o.size_; }
   bool operator<=(const Chunk& o) const { return size_ <= o.size_; }

@@ -75,9 +77,9 @@
   void showMe() {
     int64_t max_end_addr = 0;
     tensor_info_.visit([&](ChunkNode* node) {
-        // LOG_S(INFO) << node->tensor_record_->name_ << " "
-        //             << node->tensor_record_->size_ << " " <<
-        //             node->offset_;
+      // LOG_S(INFO) << node->tensor_record_->name_ << " "
+      //             << node->tensor_record_->size_ << " " <<
+      //             node->offset_;
       max_end_addr =
           std::max(max_end_addr, node->tensor_record_->size_ + node->offset_);
     });
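Reading this together with the ordered_list.h change below: OrderedList::Add now breaks its scan with strict > and < comparisons, so the element types it stores need the strict operators too — hence operator> (plus >= and <= for completeness) on ChunkNode, whose ordering key is offset_, alongside Chunk's size_-keyed set. What the strict comparison changes in practice is shown in the sketch after the next diff.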
4 changes: 2 additions & 2 deletions turbo_transformers/core/allocator/ordered_list.h
@@ -65,10 +65,10 @@ class OrderedList {
     Node* cursor = head_ptr_->next_.get();
     while (cursor != nullptr) {
       // descending order
-      if (reverse && *content_ptr >= *cursor->content_) {
+      if (reverse && *content_ptr > *cursor->content_) {
         break;
         // ascending order
-      } else if (!reverse && *content_ptr <= *cursor->content_) {
+      } else if (!reverse && *content_ptr < *cursor->content_) {
         break;
       }
       prev_node = cursor;
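The strict comparisons change where an element that ties with an existing one lands. With the old >=/<=, the scan broke as soon as it met an equal element, so the newcomer was inserted before its equals; with >/<, the scan walks past equals and inserts after them, preserving insertion order among ties (relevant when several chunks share a size_ or several nodes share an offset_). A Python sketch of the two behaviors on a plain ascending list (a simplified stand-in for the C++ linked list, not the repo's API):

def insert_sorted(items, value, strict=True):
    # strict=True  -> break on value <  cursor (new code): ties go AFTER equals.
    # strict=False -> break on value <= cursor (old code): ties go BEFORE equals.
    pos = 0
    for cursor in items:
        if (value < cursor) if strict else (value <= cursor):
            break
        pos += 1
    items.insert(pos, value)
    return pos

chunks = [16, 32, 32, 64]
assert insert_sorted(chunks[:], 32, strict=True) == 3   # after the existing 32s
assert insert_sorted(chunks[:], 32, strict=False) == 1  # before the existing 32s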
2 changes: 1 addition & 1 deletion turbo_transformers/python/tests/bert_model_test.py
@@ -106,7 +106,7 @@ def bert_model_test_helper(self, use_memory_opt=False):
        turbo_transformers.reset_allocator_schema("naive")

    def test_bert_model(self):
-        # self.bert_model_test_helper(True)
+        self.bert_model_test_helper(True)
        self.bert_model_test_helper(False)
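With the allocator bug fixed, the previously commented-out memory-optimized path is switched back on, so test_bert_model now exercises both the model-aware and the naive allocator schema. A sketch of the toggle pattern around an inference pass — "naive" and the bert_opt_mem_allocate_api signature come straight from this diff, "model-aware" is the schema name TurboTransformers uses for this allocator, and the wrapper itself is illustrative:

import turbo_transformers

def run_with_model_aware_allocator(model, input_ids, batch_size, seq_len, cfg):
    # Switch the global allocator to the model-aware schema, plan BERT's
    # activations for this (batch, seq_len), run, then restore the default.
    turbo_transformers.reset_allocator_schema("model-aware")
    try:
        turbo_transformers.bert_opt_mem_allocate_api(
            batch_size, seq_len, cfg.num_attention_heads,
            cfg.hidden_size, cfg.num_hidden_layers, "CPU")
        return model(input_ids)
    finally:
        turbo_transformers.reset_allocator_schema("naive")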
