
Commit d33093e
Merge branch 'hidet-org:main' into main
2 parents ef57171 + 873d3a1
BolinSNLHM committed Jan 10, 2024
Showing 91 changed files with 4,014 additions and 548 deletions.
43 changes: 26 additions & 17 deletions .github/scripts/run_tests.py
@@ -2,9 +2,8 @@
import json
import subprocess
import pathlib
import numpy as np
import tqdm
from db_utils import get_db_conn
import argparse
from tabulate import tabulate

external_models = ['llama-7b', 'gpt2']

@@ -21,15 +20,7 @@ def run_command(cmd):
raise RuntimeError(f'Command {cmd} failed with return code {ret}.')
return stdout

def get_bench_cmd(run_type, run_id, run_name, run_param_name, dtype):
# Get the name of the benchmark script from DB
conn = get_db_conn()
cursor = conn.cursor()
query = f'SELECT runfile FROM {run_type} WHERE id = {run_id}'
cursor.execute(query)
runfile = cursor.fetchall()[0][0]
cursor.close()
conn.close()
def get_bench_cmd(run_type, run_id, run_name, runfile, run_param_name, dtype):
if run_name in external_models:
runfile = './models/bench/' + runfile
else:
@@ -38,29 +29,47 @@ def get_bench_cmd(run_type, run_id, run_name, run_param_name, dtype):
return cmd

if __name__ == '__main__':
fh = open('run_configs.json')
parser = argparse.ArgumentParser(prog='Run Benchmarks')
parser.add_argument(
'--print',
action='store_true',
default=False,
help='Print results'
)
parser.add_argument(
'--configs',
type=str,
default='run_configs.json',
help='Specify configurations file to use for benchmarking'
)
args = parser.parse_args()
configs_file = args.configs
fh = open(configs_file)
run_configs = json.load(fh)
fh.close()
hw_config = os.environ.get('HW_CONFIG')
print('hw:', hw_config)
for run_config in run_configs:
# Append hardware_config column
run_config['hardware_config'] = hw_config
# Extract configurations
run_type = run_config['type']
run_id = run_config['id']
run_name = run_config['name']
runfile = run_config['runfile']
run_param_id = run_config['param_id']
run_param_name = run_config['param_name']
run_dtype_id = run_config['dtype_id']
run_dtype_name = run_config['dtype_name']
cmd = get_bench_cmd(run_type, run_id, run_name, run_param_name, run_dtype_name)
cmd = get_bench_cmd(run_type, run_id, run_name, runfile, run_param_name, run_dtype_name)
outputs = run_command(cmd)
if outputs:
# The second-to-last line of every benchmark script's stdout is the latency (the last line is empty)
latency = float(outputs.split('\n')[-2])
run_config['latency'] = latency
else:
run_config['latency'] = 999.99
with open('run_configs.json', 'w') as fh:
json.dump(run_configs, fh)
with open(configs_file, 'w') as fh:
json.dump(run_configs, fh)

if args.print:
print(tabulate(run_configs, headers="keys"))
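For context, below is a minimal sketch of how one entry of the configurations file flows through this script. The field values and the sample stdout are hypothetical; the real entries are written by start_instances.py, and the exact benchmark command string is not shown in this diff.

```python
# Hypothetical run_configs.json entry; real entries are generated by start_instances.py.
run_config = {
    'type': 'model', 'id': 1, 'name': 'resnet50', 'runfile': 'bench_resnet.py',
    'param_id': 1, 'param_name': 'bs=1', 'dtype_id': 1, 'dtype_name': 'float16',
}

# External models resolve their runfile under ./models/bench/, as in get_bench_cmd.
external_models = ['llama-7b', 'gpt2']
runfile = run_config['runfile']
if run_config['name'] in external_models:
    runfile = './models/bench/' + runfile

# Each benchmark script prints the latency as the second-to-last line of stdout
# (the last line is empty); a sentinel of 999.99 is recorded when there is no output.
stdout = 'benchmark log line\n7.81\n'
run_config['latency'] = float(stdout.split('\n')[-2]) if stdout else 999.99
print(runfile, run_config['latency'])
```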
31 changes: 23 additions & 8 deletions .github/scripts/start_instances.py
@@ -30,7 +30,7 @@ def run_command(cmd):
hw_config_ids = [s for s in hw_config_ids.split(',') if s]

instances = []
# Fetch list of (cloud_provider_id, instance_id) tuples from DB
# Fetch list of (cloud_provider_id, instance_id) tuples from DB and add them to the list of instances to launch
for hw_config_id in hw_config_ids:
query = (
'SELECT cloud_provider_id, instance_id, hardware_config.name as hw_config FROM cloud_instance '
@@ -43,12 +43,22 @@ def run_command(cmd):
raise ValueError(f'Instance with hardware config id {hw_config_id} does not exist.')
instances.append(rows[0])

# Fetch the compile server instance ID from DB and add it to list of instances to launch
query = (
f'SELECT cloud_provider_id, instance_id, 0 FROM compile_server WHERE org = \'{repo_org}\' LIMIT 1'
)
cursor.execute(query)
rows = cursor.fetchall()
if len(rows) == 0:
raise ValueError(f'No compile server found in DB.')
instances.append(rows[0])

# Store a json containing all the required model/OPs (and inputs) for this regression run
# This json will be uploaded as an artifact, and will be filled in by subsequent jobs
# For now, we run all model/input combinations by default
run_configs = []
query = (
'SELECT model.id as model_id, model.name as model_name, input_parameter.id as param_id, '
'SELECT model.id as model_id, model.name as model_name, model.runfile as runfile, input_parameter.id as param_id, '
'input_parameter.parameter as param_name, dtype.id as dtype_id, dtype.name as dtype_name '
'FROM model JOIN model_input_parameter ON '
'model.id = model_input_parameter.model_id JOIN input_parameter ON '
@@ -57,13 +67,13 @@ def run_command(cmd):
cursor.execute(query)
rows = cursor.fetchall()
for row in rows:
model_id, model_name, param_id, param_name, dtype_id, dtype_name = row
run_configs.append({'type': 'model', 'id': int(model_id), 'name': model_name,
model_id, model_name, model_runfile, param_id, param_name, dtype_id, dtype_name = row
run_configs.append({'type': 'model', 'id': int(model_id), 'name': model_name, 'runfile': model_runfile,
'param_id': int(param_id), 'param_name': param_name,
'dtype_id': int(dtype_id), 'dtype_name': dtype_name,
})
query = (
'SELECT operator.id as operator_id, operator.name as operator_name, input_parameter.id as param_id, '
'SELECT operator.id as operator_id, operator.name as operator_name, operator.runfile as runfile, input_parameter.id as param_id, '
'input_parameter.parameter as param_name, dtype.id as dtype_id, dtype.name as dtype_name '
'FROM operator JOIN operator_input_parameter ON '
'operator.id = operator_input_parameter.operator_id JOIN input_parameter ON '
@@ -72,8 +82,8 @@ def run_command(cmd):
cursor.execute(query)
rows = cursor.fetchall()
for row in rows:
op_id, op_name, param_id, param_name, dtype_id, dtype_name = row
run_configs.append({'type': 'operator', 'id': int(op_id), 'name': op_name,
op_id, op_name, op_runfile, param_id, param_name, dtype_id, dtype_name = row
run_configs.append({'type': 'operator', 'id': int(op_id), 'name': op_name, 'runfile': op_runfile,
'param_id': int(param_id), 'param_name': param_name,
'dtype_id': int(dtype_id), 'dtype_name': dtype_name,
})
@@ -89,6 +99,8 @@ def run_command(cmd):
cloud_provider_id, instance_id, _ = instance
if cloud_provider_id == 1: # AWS
cmd = ['aws', 'ec2', 'start-instances', '--instance-ids', instance_id]
elif cloud_provider_id == 2: # Always on, no need to launch. Do Nothing.
cmd = ['true']
else:
raise ValueError(f'Unknown cloud provider id: {cloud_provider_id}')
output = run_command(cmd)
@@ -108,6 +120,8 @@ def run_command(cmd):
raise RuntimeError(f'Failed to check status for {instance_id} on cloud provider {cloud_provider_id}.')
if output.stdout.count('ok') >= 2:
started = True
elif cloud_provider_id == 2: # Always on, no need to launch. Do Nothing.
started = True
else:
raise ValueError(f'Unknown cloud provider id: {cloud_provider_id}')

@@ -126,7 +140,8 @@ def run_command(cmd):
hw_configs = []
for instance in instances:
_, _, hw_config = instance
hw_configs.append(hw_config)
if hw_config != 0:
hw_configs.append(hw_config)
hw_config_json_str = json.dumps(hw_configs)
with open(os.environ['GITHUB_OUTPUT'], 'a') as fh:
print(f'hw_configs={hw_config_json_str}', file=fh)
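A rough sketch of the start-and-poll flow these changes extend is shown below. Provider id 1 (AWS) and id 2 (an always-on machine) follow the diff above; the exact AWS status command is elided from this diff, so the use of `describe-instance-status` here is an assumption, and the polling interval is illustrative.

```python
import subprocess
import time

def start_and_wait(cloud_provider_id: int, instance_id: str) -> None:
    # Provider 1 is AWS; provider 2 is an always-on machine that needs no launch.
    if cloud_provider_id == 1:
        subprocess.run(['aws', 'ec2', 'start-instances', '--instance-ids', instance_id],
                       check=True)
    elif cloud_provider_id == 2:
        return  # Always on, nothing to start.
    else:
        raise ValueError(f'Unknown cloud provider id: {cloud_provider_id}')

    # Poll until the instance reports two 'ok' status checks, as in the script above.
    # Assumed status command; the real command is not shown in the diff.
    while True:
        out = subprocess.run(['aws', 'ec2', 'describe-instance-status',
                              '--instance-ids', instance_id],
                             capture_output=True, text=True, check=True)
        if out.stdout.count('ok') >= 2:
            return
        time.sleep(30)
```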
4 changes: 4 additions & 0 deletions .github/scripts/stop_instances.py
@@ -19,6 +19,8 @@ def run_command(cmd):
instance_id = ids[1]
if cloud_provider_id == 1: # AWS
cmd = ['aws', 'ec2', 'stop-instances', '--instance-ids', instance_id]
elif cloud_provider_id == 2: # Always on, no need to stop. Do Nothing.
cmd = ['true']
else:
raise ValueError(f'Unknown cloud provider id: {cloud_provider_id}')
output = run_command(cmd)
@@ -42,5 +44,7 @@ def run_command(cmd):
# An instance still running would contain its id in the status.
if instance_id not in output.stdout:
stopped = True
elif cloud_provider_id == 2: # Always on, no need to stop. Do Nothing.
stopped = True
else:
raise ValueError(f'Unknown cloud provider id: {cloud_provider_id}')
45 changes: 25 additions & 20 deletions .github/workflows/regression.yaml
@@ -11,20 +11,6 @@ on:
issue_comment:
types: [created]

env:
CI_DB_HOSTNAME: ${{ secrets.CI_DB_HOSTNAME }}
CI_DB_PORT: ${{ secrets.CI_DB_PORT }}
CI_DB_USERNAME: ${{ secrets.CI_DB_USERNAME }}
CI_DB_PASSWORD: ${{ secrets.CI_DB_PASSWORD }}
CI_CS_HOSTNAME: ${{ secrets.CI_CS_HOSTNAME }}
CI_CS_PORT: ${{ secrets.CI_CS_PORT }}
CI_CS_USERNAME: ${{ secrets.CI_CS_USERNAME }}
CI_CS_PASSWORD: ${{ secrets.CI_CS_PASSWORD }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-1
HF_TOKEN: ${{ secrets.HF_TOKEN }}

jobs:
start_instances:
if: |
@@ -47,9 +33,16 @@
id: run_py_script
run: timeout 900 python ./.github/scripts/start_instances.py
env:
# TODO: Allow launching only specified GPU instances
HW_CONFIG: all
REPO_NAME: ${{ github.repository }}
# TODO: Allow launching only specified GPU instances
CI_DB_HOSTNAME: ${{ secrets.CI_DB_HOSTNAME }}
CI_DB_PORT: ${{ secrets.CI_DB_PORT }}
CI_DB_USERNAME: ${{ secrets.CI_DB_USERNAME }}
CI_DB_PASSWORD: ${{ secrets.CI_DB_PASSWORD }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-1

- name: Upload run configs
uses: actions/upload-artifact@v3
@@ -110,6 +103,10 @@ jobs:
uses: actions/download-artifact@v3
with:
name: run_configs

- name: Clear cache
run: |
hidet cache clear --all
- name: Run tests
timeout-minutes: 2880
@@ -121,6 +118,11 @@
REPO_BRANCH: |
${{ github.event_name == 'workflow_dispatch' && github.ref_name ||
format('pull/{0}', github.event.issue.number) }}
CI_CS_HOSTNAME: ${{ secrets.CI_CS_HOSTNAME }}
CI_CS_PORT: ${{ secrets.CI_CS_PORT }}
CI_CS_USERNAME: ${{ secrets.CI_CS_USERNAME }}
CI_CS_PASSWORD: ${{ secrets.CI_CS_PASSWORD }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}

- name: Upload run configs
uses: actions/upload-artifact@v3
@@ -135,10 +137,6 @@
steps:
- name: Checkout repo
uses: actions/checkout@v4
with:
ref: |
${{ github.event_name == 'workflow_dispatch' && github.ref_name ||
format('refs/pull/{0}/head', github.event.issue.number) }}

- name: Install dependencies
run: pip install mysql-connector-python
@@ -166,6 +164,10 @@ jobs:
COMMIT_TIME: ${{ env.COMMIT_TIME }}
COMMIT_AUTHOR: ${{ env.COMMIT_AUTHOR }}
HW_CONFIGS: ${{ needs.start_instances.outputs.hw_configs }}
CI_DB_HOSTNAME: ${{ secrets.CI_DB_HOSTNAME }}
CI_DB_PORT: ${{ secrets.CI_DB_PORT }}
CI_DB_USERNAME: ${{ secrets.CI_DB_USERNAME }}
CI_DB_PASSWORD: ${{ secrets.CI_DB_PASSWORD }}

stop_instances:
if: |
@@ -181,4 +183,7 @@
- name: Run main Python script
run: timeout 900 python ./.github/scripts/stop_instances.py
env:
STARTED_INSTANCES: ${{ needs.start_instances.outputs.started_instances }}
STARTED_INSTANCES: ${{ needs.start_instances.outputs.started_instances }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-1
9 changes: 9 additions & 0 deletions .gitignore
@@ -204,3 +204,12 @@ build-release

# intermediate files
/gallery/**/*.json

# hidet model files
*.hidet

# lock files
*.lock

# experiments folder
/experiments
8 changes: 5 additions & 3 deletions CMakeLists.txt
@@ -17,8 +17,10 @@ message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")

# add hidet_runtime target
add_library(hidet_runtime SHARED
src/hidet/runtime/cuda_context.cpp
src/hidet/runtime/cpu_context.cpp
src/hidet/runtime/cuda/context.cpp
src/hidet/runtime/cuda/cublas.cpp
src/hidet/runtime/cuda/cuda.cpp
src/hidet/runtime/cpu/context.cpp
src/hidet/runtime/callbacks.cpp
src/hidet/runtime/logging.cpp
src/hidet/runtime/symbols.cpp
@@ -28,7 +30,7 @@ set_target_properties(hidet_runtime PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_

# add hidet target
add_library(hidet SHARED
src/hidet/packedfunc.cpp
src/hidet/empty.cpp # empty source file
)
target_include_directories(hidet PRIVATE ${CMAKE_SOURCE_DIR}/include)
set_target_properties(hidet PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
3 changes: 2 additions & 1 deletion apps/compile_server/resources/compilation.py
@@ -56,7 +56,8 @@ def clone_github_repo(owner: str, repo: str, version: str) -> str:
branches = repo.git.branch("--all").split()
# If local branch already exists, delete it as we prepare to do a new fresh checkout
# This is because the local branch might be divergent with remote, so we just discard it
if version in branches:
# The exception is the main branch, since it should never diverge
if version in branches and version != 'main':
repo.git.checkout('main')
repo.git.branch('-D', version)
if 'pull/' in version:
2 changes: 2 additions & 0 deletions docs/source/how-to-guides/add-new-operator/index.rst
@@ -1,6 +1,8 @@
Add New Operator
================



Hidet is designed to be extensible. It is easy to add new operators to Hidet. There are two ways to add and schedule
an operator.

1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -36,6 +36,7 @@ Hidet is an open-source DNN inference framework, it features
:maxdepth: 1
:caption: Developer Guide

gallery/developer-guides/add-torch-operator-mapping
how-to-guides/add-new-operator/index
gallery/developer-guides/add-operator-resolve-rule
gallery/developer-guides/add-subgraph-rewrite-rule
4 changes: 4 additions & 0 deletions docs/source/python_api/data_types.rst
@@ -1,6 +1,9 @@
hidet.dtypes
============

Hidet supports the following primitive data types, which can be used as the ``dtype`` parameter of functions like
:func:`hidet.zeros` and :func:`hidet.ones`:

.. data:: hidet.uint8
.. data:: hidet.uint16
.. data:: hidet.uint32
@@ -14,3 +17,4 @@ hidet.dtypes
.. data:: hidet.float64
.. data:: hidet.bfloat16
.. data:: hidet.tfloat32
.. data:: hidet.boolean
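A minimal usage sketch for these dtypes, assuming a working hidet installation (the shapes and dtype choices are illustrative):

```python
import hidet

# Create tensors with an explicit primitive dtype, as documented above.
a = hidet.zeros([2, 3], dtype=hidet.float16)
b = hidet.ones([2, 3], dtype=hidet.int32)
print(a.dtype, b.dtype)
```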
@@ -2,5 +2,6 @@ hidet.drivers
-------------

.. automodule:: hidet.drivers
:members:
:autosummary:
:members:
:imported-members:
:autosummary:
8 changes: 8 additions & 0 deletions docs/source/python_api/ffi/index.rst
@@ -0,0 +1,8 @@
hidet.ffi
---------

.. automodule:: hidet.ffi
:members:
:imported-members:
:autosummary:

2 changes: 2 additions & 0 deletions docs/source/python_api/index.rst
@@ -15,8 +15,10 @@ Python API
cuda
tensor
data_types
drivers
ops/index
graph/index
runtime/index
ffi/index
utils/index
testing/index
1 change: 1 addition & 0 deletions docs/source/python_api/option.rst
@@ -4,3 +4,4 @@ hidet.option
.. automodule:: hidet.option
:members:
:autosummary:
:member-order: groupwise
