Cherry pick (batch 2) to rel-1.5.1 (#5290)
* remove implicit linking of tensorrt and dnnl ep shared libs (#5262)
* Update DirectML Nuget to 1.3.0 (#5274)
* Update PyTorch TransformerModel sample (#5275)
* Insert telemetry template into GPU build, add telemetry build switches. (#5278)
* Synchronize training dependency versions between Docker image and Python wheel (#5261)
* Downgrade GCC (#5269)
* Remove --enable_symbolic_shape_infer_tests to fix linux ci pipeline build error.

Co-authored-by: Edward Chen
Co-authored-by: George Wu <[email protected]>
Co-authored-by: Dwayne Robinson <[email protected]>
Co-authored-by: Thiago Crepaldi <[email protected]>
Co-authored-by: Dmitri Smirnov <[email protected]>
Co-authored-by: edgchen1 <[email protected]>
Co-authored-by: Changming Sun <[email protected]>
7 people authored Sep 25, 2020
1 parent 389cca7 commit c00e13a
Showing 57 changed files with 651 additions and 334 deletions.
11 changes: 6 additions & 5 deletions BUILD.md
@@ -1103,12 +1103,13 @@ Dockerfile instructions are available [here](./dockerfiles#migraphx)
The default NVIDIA GPU build requires CUDA runtime libraries installed on the system:
-* CUDA 10.2
-* cuDNN 7.6.5
-* NCCL v2.7.8
-* OpenMPI 4.0.4
+* [CUDA](https://developer.nvidia.com/cuda-toolkit) 10.2
+* [cuDNN](https://developer.nvidia.com/cudnn) 8.0
+* [NCCL](https://developer.nvidia.com/nccl) 2.7
+* [OpenMPI](https://www.open-mpi.org/) 4.0.4
+  * See [install_openmpi.sh](./tools/ci_build/github/linux/docker/scripts/install_openmpi.sh)
-The official dependency versions are specified in [Dockerfile.training](./dockerfiles/Dockerfile.training).
+These dependency versions should reflect what is in [Dockerfile.training](./dockerfiles/Dockerfile.training).
## Build instructions
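A quick way to sanity-check these prerequisites from Python is a minimal sketch like the following, assuming a CUDA-enabled PyTorch build is installed; the printed values are expectations for this configuration, not guarantees:

```python
import torch

# Expected for this configuration: CUDA 10.2, cuDNN 8.0.x, NCCL 2.7.x.
print(torch.version.cuda)              # e.g. "10.2"
print(torch.backends.cudnn.version())  # e.g. 8003 for cuDNN 8.0.3
print(torch.cuda.nccl.version())       # e.g. 2708 for NCCL 2.7.8
```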
2 changes: 1 addition & 1 deletion cmake/external/dml.cmake
@@ -20,7 +20,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config)
set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config)
get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE)
-set(DML_PACKAGE_DIR ${PACKAGES_DIR}/DirectML.3.0.0)
+set(DML_PACKAGE_DIR ${PACKAGES_DIR}/DirectML.1.3.0)
set(DML_SHARED_LIB DirectML.dll)

# Restore nuget packages, which will pull down the DirectML redist package
5 changes: 1 addition & 4 deletions cmake/onnxruntime_unittests.cmake
@@ -397,9 +397,7 @@ set(ONNXRUNTIME_TEST_LIBS
${ONNXRUNTIME_INTEROP_TEST_LIBS}
${onnxruntime_libs}
${PROVIDERS_CUDA}
-# These providers are shared libraries now, so aren't linked this way anymore:
-${PROVIDERS_DNNL}
-${PROVIDERS_TENSORRT}
+# TENSORRT and DNNL are explicitly linked at runtime
${PROVIDERS_MIGRAPHX}
${PROVIDERS_NGRAPH}
${PROVIDERS_OPENVINO}
@@ -433,7 +431,6 @@ if(onnxruntime_USE_TENSORRT)
list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/tensorrt/*)
list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_tensorrt)
list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_tensorrt onnxruntime_providers_shared)
-list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_tensorrt)
endif()

if(onnxruntime_USE_NNAPI_BUILTIN)
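With the DNNL and TensorRT providers built as shared libraries, they are loaded when a session is created rather than linked into the test binaries. A minimal sketch of what a caller sees, assuming a TensorRT-enabled build and a hypothetical model.onnx:

```python
import onnxruntime as ort

# Shared-library providers show up here only if their .so/.dll can be loaded.
print(ort.get_available_providers())

# Falls back to CPU if the TensorRT provider is unavailable.
session = ort.InferenceSession(
    "model.onnx",
    providers=["TensorrtExecutionProvider", "CPUExecutionProvider"])
```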
@@ -27,6 +27,6 @@ docker run --gpus all --rm \
-e "PackageName=$PackageName" \
-e "RunTestCsharp=$RunTestCsharp" \
-e "RunTestNative=$RunTestNative" \
-onnxruntimeregistry.azurecr.io/internal/azureml/onnxruntimegpubuild:ch35 \
+onnxruntimeregistry.azurecr.io/internal/azureml/onnxruntimecentosgpubuild:ch5h \
/bin/bash /onnxruntime_src/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh \
/home/onnxruntimedev/$NUGET_REPO_DIRNAME /onnxruntime_src /home/onnxruntimedev $CurrentOnnxRuntimeVersion
@@ -35,6 +35,6 @@ docker run --rm \
-e "DisableMlOps=$DISABLEMLOPS" \
-e "RunTestCsharp=$RunTestCsharp" \
-e "RunTestNative=$RunTestNative" \
-onnxruntimeregistry.azurecr.io/internal/azureml/onnxruntimecpubuild:ch36 \
+onnxruntimeregistry.azurecr.io/internal/azureml/onnxruntimecentoscpubuild:ch5g \
/bin/bash /onnxruntime_src/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh \
/home/onnxruntimedev/$NUGET_REPO_DIRNAME /onnxruntime_src /home/onnxruntimedev $CurrentOnnxRuntimeVersion
4 changes: 2 additions & 2 deletions dockerfiles/Dockerfile.training
@@ -14,7 +14,7 @@ ARG OPENMPI_PATH=/opt/openmpi-${OPENMPI_VERSION}
ARG COMMIT=master

# cuda development image for building sources
-FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 as builder
+FROM nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04 as builder

# set location for builds
WORKDIR /stage
@@ -155,7 +155,7 @@ RUN pip install azureml-defaults transformers==2.11.0 msgpack==1.0.0 tensorboard

# switch to cuda runtime environment
# note: launch with --gpus all or nvidia-docker
-FROM nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04
+FROM nvidia/cuda:10.2-cudnn8-runtime-ubuntu18.04
WORKDIR /stage

# install ucx
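To confirm the runtime image picked up the GPU stack, a one-line check, assuming the ONNX Runtime training wheel is installed inside the container:

```python
import onnxruntime

print(onnxruntime.get_device())  # "GPU" for CUDA-enabled builds, "CPU" otherwise
```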
2 changes: 1 addition & 1 deletion docs/execution_providers/DirectML-ExecutionProvider.md
@@ -6,7 +6,7 @@ When used standalone, the DirectML API is a low-level DirectX 12 library and is

The *DirectML Execution Provider* is an optional component of ONNX Runtime that uses DirectML to accelerate inference of ONNX models. The DirectML execution provider is capable of greatly improving evaluation time of models using commodity GPU hardware, without sacrificing broad hardware support or requiring vendor-specific extensions to be installed.

-The DirectML Execution Provider currently uses DirectML version 2.1.0.
+The DirectML Execution Provider currently uses DirectML version 1.3.0.

## Table of contents

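For context, selecting this provider from Python looks like the sketch below; the model path is hypothetical and a DirectML-enabled (Windows) build of ONNX Runtime is assumed:

```python
import onnxruntime as ort

session = ort.InferenceSession(
    "model.onnx",
    providers=["DmlExecutionProvider", "CPUExecutionProvider"])
print(session.get_providers())  # confirms which providers were actually registered
```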
54 changes: 52 additions & 2 deletions onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
@@ -162,7 +162,57 @@
"^test_operator_mm",
"^test_operator_non_float_params",
"^test_operator_params",
"^test_operator_pow"
"^test_operator_pow",
"^test_nllloss_NC",
"^test_nllloss_NCd1",
"^test_nllloss_NCd1d2",
"^test_nllloss_NCd1d2d3d4d5_none_no_weight",
"^test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded",
"^test_nllloss_NCd1d2d3_none_no_weight_negative_ii",
"^test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded",
"^test_nllloss_NCd1d2d3_sum_weight_high_ii",
"^test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded",
"^test_nllloss_NCd1d2_expanded",
"^test_nllloss_NCd1d2_reduction_mean",
"^test_nllloss_NCd1d2_reduction_mean_expanded",
"^test_nllloss_NCd1d2_reduction_sum",
"^test_nllloss_NCd1d2_reduction_sum_expanded",
"^test_nllloss_NCd1d2_with_weight_reduction_sum_ii",
"^test_nllloss_NCd1d2_with_weight_reduction_sum_ii_expanded",
"^test_nllloss_NCd1_expanded",
"^test_nllloss_NC_expanded",
"^test_sce_mean_3d",
"^test_sce_mean_3d_expanded",
"^test_sce_mean_3d_log_prob",
"^test_sce_mean_3d_log_prob_expanded",
"^test_sce_mean",
"^test_sce_mean_expanded",
"^test_sce_mean_log_prob",
"^test_sce_mean_log_prob_expanded",
"^test_sce_NCd1d2d3d4d5_mean_weight",
"^test_sce_NCd1d2d3d4d5_mean_weight_expanded",
"^test_sce_NCd1d2d3d4d5_mean_weight_log_prob",
"^test_sce_NCd1d2d3d4d5_mean_weight_log_prob_expanded",
"^test_sce_NCd1d2d3d4d5_none_no_weight",
"^test_sce_NCd1d2d3d4d5_none_no_weight_expanded",
"^test_sce_NCd1d2d3d4d5_none_no_weight_log_prob",
"^test_sce_NCd1d2d3d4d5_none_no_weight_log_prob_expanded",
"^test_sce_NCd1d2d3_none_no_weight_negative_ii",
"^test_sce_NCd1d2d3_none_no_weight_negative_ii_expanded",
"^test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob",
"^test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob_expanded",
"^test_sce_NCd1d2d3_sum_weight_high_ii",
"^test_sce_NCd1d2d3_sum_weight_high_ii_expanded",
"^test_sce_NCd1d2d3_sum_weight_high_ii_log_prob",
"^test_sce_NCd1d2d3_sum_weight_high_ii_log_prob_expanded",
"^test_sce_none",
"^test_sce_none_expanded",
"^test_sce_none_log_prob",
"^test_sce_none_log_prob_expanded",
"^test_sce_sum",
"^test_sce_sum_expanded",
"^test_sce_sum_log_prob",
"^test_sce_sum_log_prob_expanded"
],
"unsupported_usages": [
"^test_convtranspose_1d", // ConvTransponse supports 4-D only
@@ -178,4 +228,4 @@
"^test_bitshift_right_uint16",
"^test_bitshift_left_uint16"
]
-}
+}
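Each entry is a regular expression matched against ONNX backend test names. A minimal sketch of how such a jsonc file could be consumed; the comment stripping is deliberately naive, and the "current_failing_tests" key is an assumption for illustration:

```python
import json
import re

def load_filters(path, key):
    # Naive jsonc handling: drop // line comments before parsing.
    with open(path) as f:
        text = "\n".join(line.split("//")[0] for line in f)
    return [re.compile(p) for p in json.loads(text).get(key, [])]

filters = load_filters("onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc",
                       "current_failing_tests")
print(any(f.match("test_sce_mean") for f in filters))  # True if the test is filtered out
```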
3 changes: 2 additions & 1 deletion orttraining/orttraining/test/gradient/gradient_ops_test.cc
@@ -289,7 +289,8 @@ TEST(GradientCheckerTest, SubGrad) {
TestBroadcastableBinaryOpGrad("Sub");
}

-TEST(GradientCheckerTest, MulGrad) {
+//flaky
+TEST(GradientCheckerTest, DISABLED_MulGrad) {
TestBroadcastableBinaryOpGrad("Mul");
}

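googletest skips any test whose name starts with DISABLED_ (it can still be run with --gtest_also_run_disabled_tests). For comparison only, the pytest analogue of disabling a flaky test is a skip marker; this is a sketch, not part of this commit:

```python
import pytest

@pytest.mark.skip(reason="flaky")  # comparable to gtest's DISABLED_ prefix
def test_mul_grad():
    ...
```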
2 changes: 1 addition & 1 deletion packages.config
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
-<package id="DirectML" version="3.0.0" targetFramework="native" />
+<package id="DirectML" version="1.3.0" targetFramework="native" />
<package id="GoogleTestAdapter" version="0.17.1" targetFramework="net46" />
</packages>
16 changes: 14 additions & 2 deletions samples/python/pytorch_transformer/README.md
@@ -10,12 +10,24 @@ This example was adapted from Pytorch's [Sequence-to-Sequence Modeling with nn.T

## Running PyTorch version

-```python
+```bash
python pt_model.py
```

## Running ONNX Runtime version

-```python
+```bash
python ort_model.py
```

+## Optional arguments
+
+| Argument | Description | Default |
+| :---------------- | :-----------------------------------------------------: | --------: |
+| --batch-size | input batch size for training | 20 |
+| --test-batch-size | input batch size for testing | 20 |
+| --epochs | number of epochs to train | 2 |
+| --lr | learning rate | 0.001 |
+| --no-cuda | disables CUDA training | False |
+| --seed | random seed | 1 |
+| --log-interval | how many batches to wait before logging training status | 200 |
85 changes: 85 additions & 0 deletions samples/python/pytorch_transformer/ort_train.py
@@ -0,0 +1,85 @@
import argparse
import math
import torch
import onnxruntime

from utils import prepare_data, get_batch
from ort_utils import my_loss, transformer_model_description_dynamic_axes
from pt_model import TransformerModel


def train(trainer, data_source, device, epoch, args, bptt=35):
    total_loss = 0.
    for batch, i in enumerate(range(0, data_source.size(0) - 1, bptt)):
        data, targets = get_batch(data_source, i)

        loss, pred = trainer.train_step(data, targets)
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            print('epoch {:3d} | {:5d}/{:5d} batches | loss {:5.2f}'.format(epoch,
                                                                            batch,
                                                                            len(data_source) // bptt,
                                                                            cur_loss))
            total_loss = 0


def evaluate(trainer, data_source, bptt=35):
    total_loss = 0.
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            loss, pred = trainer.eval_step(data, targets)
            total_loss += len(data) * loss.item()
    return total_loss / (len(data_source) - 1)


if __name__ == "__main__":
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch TransformerModel example')
    parser.add_argument('--batch-size', type=int, default=20, metavar='N',
                        help='input batch size for training (default: 20)')
    parser.add_argument('--test-batch-size', type=int, default=20, metavar='N',
                        help='input batch size for testing (default: 20)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                        help='how many batches to wait before logging training status (default: 200)')

    # Basic setup
    args = parser.parse_args()
    if not args.no_cuda and torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    torch.manual_seed(args.seed)
    onnxruntime.set_seed(args.seed)

    # Model
    optim_config = onnxruntime.training.optim.SGDConfig(lr=args.lr)
    model_desc = transformer_model_description_dynamic_axes()
    model = TransformerModel(28785, 200, 2, 200, 2, 0.2).to(device)

    # Preparing data
    train_data, val_data, test_data = prepare_data(device, args.batch_size, args.test_batch_size)
    trainer = onnxruntime.training.ORTTrainer(model, model_desc, optim_config, loss_fn=my_loss)

    # Train
    for epoch in range(1, args.epochs + 1):
        train(trainer, train_data, device, epoch, args)
        val_loss = evaluate(trainer, val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | valid loss {:5.2f} | '.format(epoch, val_loss))
        print('-' * 89)

    # Evaluate
    test_loss = evaluate(trainer, test_data)
    print('=' * 89)
    print('| End of training | test loss {:5.2f}'.format(test_loss))
    print('=' * 89)
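The loss function and model description above come from ort_utils in the same sample directory. As a hedged sketch only, a my_loss equivalent would mirror the criterion used in pt_train.py below; the helper actually shipped in ort_utils may differ:

```python
import torch.nn.functional as F

def my_loss(output, targets):
    # Flatten (seq_len, batch, vocab) logits over the sample's 28785-word
    # vocabulary and apply cross-entropy against the flattened targets.
    return F.cross_entropy(output.view(-1, 28785), targets)
```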
92 changes: 92 additions & 0 deletions samples/python/pytorch_transformer/pt_train.py
@@ -0,0 +1,92 @@
import argparse
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from utils import prepare_data, get_batch
from pt_model import TransformerModel


def train(model, data_source, device, epoch, args, bptt=35):
    # Uses the module-level `optimizer` and `criterion` defined under __main__.
    total_loss = 0.
    model.train()
    for batch, i in enumerate(range(0, data_source.size(0) - 1, bptt)):
        data, targets = get_batch(data_source, i)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, 28785), targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            print('epoch {:3d} | {:5d}/{:5d} batches | loss {:5.2f}'.format(epoch,
                                                                            batch,
                                                                            len(data_source) // bptt,
                                                                            cur_loss))
            total_loss = 0


def evaluate(model, data_source, criterion, bptt=35):
    total_loss = 0.
    model.eval()
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output = model(data)
            output_flat = output.view(-1, 28785)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)


if __name__ == "__main__":
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch TransformerModel example')
    parser.add_argument('--batch-size', type=int, default=20, metavar='N',
                        help='input batch size for training (default: 20)')
    parser.add_argument('--test-batch-size', type=int, default=20, metavar='N',
                        help='input batch size for testing (default: 20)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                        help='how many batches to wait before logging training status (default: 200)')

    # Basic setup
    args = parser.parse_args()
    if not args.no_cuda and torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    torch.manual_seed(args.seed)

    # Model
    criterion = nn.CrossEntropyLoss()
    model = TransformerModel(28785, 200, 2, 200, 2, 0.2).to(device)
    # Honor the parsed --lr instead of a hard-coded learning rate.
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)

    # Preparing data
    train_data, val_data, test_data = prepare_data(device, args.batch_size, args.test_batch_size)

    # Train
    for epoch in range(1, args.epochs + 1):
        train(model, train_data, device, epoch, args)
        val_loss = evaluate(model, val_data, criterion)
        print('-' * 89)
        print('| end of epoch {:3d} | valid loss {:5.2f} | '.format(epoch, val_loss))
        print('-' * 89)

    # Evaluate
    test_loss = evaluate(model, test_data, criterion)
    print('=' * 89)
    print('| End of training | test loss {:5.2f}'.format(test_loss))
    print('=' * 89)
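Both scripts import prepare_data and get_batch from utils. Since the sample is adapted from PyTorch's nn.Transformer tutorial, get_batch presumably follows the tutorial's windowing scheme; a sketch under that assumption, as the utils.py actually shipped may differ:

```python
def get_batch(source, i, bptt=35):
    # Take a window of at most bptt rows starting at i; the target is the
    # same window shifted by one token and flattened for cross-entropy.
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].reshape(-1)
    return data, target
```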