Merge branch 'upstream/main'

fishingguy456 · Jan 16, 2024 · dbfcc56 · dbfcc56
2 parents dcc6a45 + 53922fc
commit dbfcc56
Show file tree

Hide file tree

Showing 34 changed files with 2,364 additions and 734 deletions.
diff --git a/.github/scripts/run_tests.py b/.github/scripts/run_tests.py
@@ -2,9 +2,8 @@
 import json
 import subprocess
 import pathlib
-import numpy as np
-import tqdm
-from db_utils import get_db_conn
+import argparse
+from tabulate import tabulate
 
 external_models = ['llama-7b', 'gpt2']
 
@@ -21,15 +20,7 @@ def run_command(cmd):
         raise RuntimeError(f'Command {cmd} failed with return code {ret}.')
     return stdout
 
-def get_bench_cmd(run_type, run_id, run_name, run_param_name, dtype):
-    # Get the name of the benchmark script from DB
-    conn = get_db_conn()
-    cursor = conn.cursor()
-    query = f'SELECT runfile FROM {run_type} WHERE id = {run_id}'
-    cursor.execute(query)
-    runfile = cursor.fetchall()[0][0]
-    cursor.close()
-    conn.close()
+def get_bench_cmd(run_type, run_id, run_name, runfile, run_param_name, dtype):
     if run_name in external_models:
         runfile = './models/bench/' + runfile
     else:
@@ -38,29 +29,47 @@ def get_bench_cmd(run_type, run_id, run_name, run_param_name, dtype):
     return cmd
 
 if __name__ == '__main__':
-    fh = open('run_configs.json')
+    parser = argparse.ArgumentParser(prog='Run Benchmarks')
+    parser.add_argument(
+        '--print',
+        action='store_true',
+        default=False,
+        help='Print results'
+    )
+    parser.add_argument(
+        '--configs',
+        type=str,
+        default='run_configs.json',
+        help='Specify configurations file to use for benchmarking'
+    )
+    args = parser.parse_args()
+    configs_file = args.configs
+    fh = open(configs_file)
     run_configs = json.load(fh)
     fh.close()
     hw_config = os.environ.get('HW_CONFIG')
-    print('hw:', hw_config)
     for run_config in run_configs:
         # Append hardware_config column
         run_config['hardware_config'] = hw_config
         # Extract configurations
         run_type = run_config['type']
         run_id = run_config['id']
         run_name = run_config['name']
+        runfile = run_config['runfile']
         run_param_id = run_config['param_id']
         run_param_name = run_config['param_name']
         run_dtype_id = run_config['dtype_id']
         run_dtype_name = run_config['dtype_name']
-        cmd = get_bench_cmd(run_type, run_id, run_name, run_param_name, run_dtype_name)
+        cmd = get_bench_cmd(run_type, run_id, run_name, runfile, run_param_name, run_dtype_name)
         outputs = run_command(cmd)
         if outputs:
             # The second-to-last line of every benchmark script's stdout is the latency. (The last line is empty.)
             latency = float(outputs.split('\n')[-2])
             run_config['latency'] = latency
         else:
             run_config['latency'] = 999.99
-    with open('run_configs.json', 'w') as fh:
-        json.dump(run_configs, fh)
+    with open(configs_file, 'w') as fh:
+        json.dump(run_configs, fh)
+
+    if args.print:
+       print(tabulate(run_configs, headers="keys")) 
diff --git a/.github/scripts/start_instances.py b/.github/scripts/start_instances.py
@@ -45,7 +45,7 @@ def run_command(cmd):
 
     # Fetch the compile server instance ID from DB and add it to list of instances to launch
     query = (
-        'SELECT cloud_provider_id, instance_id, 0 FROM compile_server LIMIT 1'
+        f'SELECT cloud_provider_id, instance_id, 0 FROM compile_server WHERE org = \'{repo_org}\' LIMIT 1'
     )
     cursor.execute(query)
     rows = cursor.fetchall()
@@ -58,7 +58,7 @@ def run_command(cmd):
     # For now, we run all model/input combinations by default
     run_configs = []
     query = (
-        'SELECT model.id as model_id, model.name as model_name, input_parameter.id as param_id, '
+        'SELECT model.id as model_id, model.name as model_name, model.runfile as runfile, input_parameter.id as param_id, '
         'input_parameter.parameter as param_name, dtype.id as dtype_id, dtype.name as dtype_name '
         'FROM model JOIN model_input_parameter ON '
         'model.id = model_input_parameter.model_id JOIN input_parameter ON '
@@ -67,13 +67,13 @@ def run_command(cmd):
     cursor.execute(query)
     rows = cursor.fetchall()
     for row in rows:
-        model_id, model_name, param_id, param_name, dtype_id, dtype_name = row
-        run_configs.append({'type': 'model', 'id': int(model_id), 'name': model_name, 
+        model_id, model_name, model_runfile, param_id, param_name, dtype_id, dtype_name = row
+        run_configs.append({'type': 'model', 'id': int(model_id), 'name': model_name, 'runfile': model_runfile,
                             'param_id': int(param_id), 'param_name': param_name,
                             'dtype_id': int(dtype_id), 'dtype_name': dtype_name,
                             })
     query = (
-        'SELECT operator.id as operator_id, operator.name as operator_name, input_parameter.id as param_id, '
+        'SELECT operator.id as operator_id, operator.name as operator_name, operator.runfile as runfile, input_parameter.id as param_id, '
         'input_parameter.parameter as param_name, dtype.id as dtype_id, dtype.name as dtype_name '
         'FROM operator JOIN operator_input_parameter ON '
         'operator.id = operator_input_parameter.operator_id JOIN input_parameter ON '
@@ -82,8 +82,8 @@ def run_command(cmd):
     cursor.execute(query)
     rows = cursor.fetchall()
     for row in rows:
-        op_id, op_name, param_id, param_name, dtype_id, dtype_name = row
-        run_configs.append({'type': 'operator', 'id': int(op_id), 'name': op_name, 
+        op_id, op_name, op_runfile, param_id, param_name, dtype_id, dtype_name = row
+        run_configs.append({'type': 'operator', 'id': int(op_id), 'name': op_name, 'runfile': op_runfile,
                             'param_id': int(param_id), 'param_name': param_name,
                             'dtype_id': int(dtype_id), 'dtype_name': dtype_name,
                             })
@@ -99,6 +99,8 @@ def run_command(cmd):
         cloud_provider_id, instance_id, _ = instance
         if cloud_provider_id == 1: # AWS
             cmd = ['aws', 'ec2', 'start-instances', '--instance-ids', instance_id]
+        elif cloud_provider_id == 2: # Always on, no need to launch. Do Nothing.
+            cmd = ['true']
         else:
             raise ValueError(f'Unknown cloud provider id: {cloud_provider_id}')
         output = run_command(cmd)
@@ -118,6 +120,8 @@ def run_command(cmd):
                     raise RuntimeError(f'Failed to check status for {instance_id} on cloud provider {cloud_provider_id}.')
                 if output.stdout.count('ok') >= 2:
                     started = True
+            elif cloud_provider_id == 2: # Always on, no need to launch. Do Nothing.
+                started = True
             else:
                 raise ValueError(f'Unknown cloud provider id: {cloud_provider_id}')
 

diff --git a/.github/scripts/stop_instances.py b/.github/scripts/stop_instances.py
@@ -19,6 +19,8 @@ def run_command(cmd):
         instance_id = ids[1]
         if cloud_provider_id == 1: # AWS
             cmd = ['aws', 'ec2', 'stop-instances', '--instance-ids', instance_id]
+        elif cloud_provider_id == 2: # Always on, no need to stop. Do Nothing.
+            cmd = ['true']
         else:
             raise ValueError(f'Unknown cloud provider id: {cloud_provider_id}')
         output = run_command(cmd)
@@ -42,5 +44,7 @@ def run_command(cmd):
                 # An instance still running would contain its id in the status.
                 if instance_id not in output.stdout:
                     stopped = True
+            elif cloud_provider_id == 2: # Always on, no need to stop. Do Nothing.
+                stopped = True
             else:
                 raise ValueError(f'Unknown cloud provider id: {cloud_provider_id}')
diff --git a/.github/workflows/regression.yaml b/.github/workflows/regression.yaml
@@ -11,20 +11,6 @@ on:
   issue_comment:
     types: [created]
 
-env:
-  CI_DB_HOSTNAME: ${{ secrets.CI_DB_HOSTNAME }}
-  CI_DB_PORT: ${{ secrets.CI_DB_PORT }}
-  CI_DB_USERNAME: ${{ secrets.CI_DB_USERNAME }}
-  CI_DB_PASSWORD: ${{ secrets.CI_DB_PASSWORD }}
-  CI_CS_HOSTNAME: ${{ secrets.CI_CS_HOSTNAME }}
-  CI_CS_PORT: ${{ secrets.CI_CS_PORT }}
-  CI_CS_USERNAME: ${{ secrets.CI_CS_USERNAME }}
-  CI_CS_PASSWORD: ${{ secrets.CI_CS_PASSWORD }}
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-  AWS_DEFAULT_REGION: us-east-1
-  HF_TOKEN: ${{ secrets.HF_TOKEN }}
-
 jobs:
   start_instances:
     if: |
@@ -50,6 +36,13 @@ jobs:
           # TODO: Allow launching only specified GPU instances
           HW_CONFIG: all
           REPO_NAME: ${{ github.repository }}
+          CI_DB_HOSTNAME: ${{ secrets.CI_DB_HOSTNAME }}
+          CI_DB_PORT: ${{ secrets.CI_DB_PORT }}
+          CI_DB_USERNAME: ${{ secrets.CI_DB_USERNAME }}
+          CI_DB_PASSWORD: ${{ secrets.CI_DB_PASSWORD }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_DEFAULT_REGION: us-east-1
 
       - name: Upload run configs
         uses: actions/upload-artifact@v3
@@ -110,6 +103,10 @@ jobs:
         uses: actions/download-artifact@v3
         with:
           name: run_configs
+
+      - name: Clear cache
+        run: |
+          hidet cache clear --all
       
       - name: Run tests
         timeout-minutes: 2880
@@ -121,6 +118,11 @@ jobs:
           REPO_BRANCH: |
             ${{ github.event_name == 'workflow_dispatch' && github.ref_name ||
             format('pull/{0}', github.event.issue.number) }}
+          CI_CS_HOSTNAME: ${{ secrets.CI_CS_HOSTNAME }}
+          CI_CS_PORT: ${{ secrets.CI_CS_PORT }}
+          CI_CS_USERNAME: ${{ secrets.CI_CS_USERNAME }}
+          CI_CS_PASSWORD: ${{ secrets.CI_CS_PASSWORD }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
 
       - name: Upload run configs
         uses: actions/upload-artifact@v3
@@ -135,10 +137,6 @@ jobs:
     steps:
       - name: Checkout repo
         uses: actions/checkout@v4
-        with:
-          ref: |
-            ${{ github.event_name == 'workflow_dispatch' && github.ref_name ||
-            format('refs/pull/{0}/head', github.event.issue.number) }}
 
       - name: Install dependencies
         run: pip install mysql-connector-python
@@ -166,6 +164,10 @@ jobs:
           COMMIT_TIME: ${{ env.COMMIT_TIME }}
           COMMIT_AUTHOR: ${{ env.COMMIT_AUTHOR }}
           HW_CONFIGS: ${{ needs.start_instances.outputs.hw_configs }}
+          CI_DB_HOSTNAME: ${{ secrets.CI_DB_HOSTNAME }}
+          CI_DB_PORT: ${{ secrets.CI_DB_PORT }}
+          CI_DB_USERNAME: ${{ secrets.CI_DB_USERNAME }}
+          CI_DB_PASSWORD: ${{ secrets.CI_DB_PASSWORD }}
 
   stop_instances:
     if: |
@@ -181,4 +183,7 @@ jobs:
       - name: Run main Python script
         run: timeout 900 python ./.github/scripts/stop_instances.py
         env:
-          STARTED_INSTANCES: ${{ needs.start_instances.outputs.started_instances }}
+          STARTED_INSTANCES: ${{ needs.start_instances.outputs.started_instances }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_DEFAULT_REGION: us-east-1
diff --git a/apps/compile_server/resources/compilation.py b/apps/compile_server/resources/compilation.py
@@ -56,7 +56,8 @@ def clone_github_repo(owner: str, repo: str, version: str) -> str:
             branches = repo.git.branch("--all").split()
             # If local branch already exists, delete it as we prepare to do a new fresh checkout
             # This is because the local branch might be divergent with remote, so we just discard it
-            if version in branches:
+            # The exception is the main branch, since it should never diverge
+            if version in branches and version != 'main':
                 repo.git.checkout('main')
                 repo.git.branch('-D', version)
             if 'pull/' in version:

diff --git a/python/hidet/drivers/build_graph.py b/python/hidet/drivers/build_graph.py
@@ -46,6 +46,9 @@ def get_graph_weights(graph):
 
 
 def get_graph_intermediates(graph):
+    """
+    Get the intermediate tensors of the graph: {output tensors of nodes} - {output tensors of the graph}
+    """
     intermediates: List[Tensor] = []
     for node in graph.nodes:
         for y in node.outputs:
@@ -145,7 +148,12 @@ def get_graph_meta_data(graph: FlowGraph, num_kernels, space: int) -> GraphMetaD
     graph_hash = sha256('\n'.join(lines).encode('utf-8')).hexdigest()[:16]
 
     return GraphMetaData(
-        inputs=inputs, outputs=outputs, hidet_version=hidet.__version__, num_kernels=num_kernels, graph_hash=graph_hash
+        inputs=inputs,
+        outputs=outputs,
+        hidet_version=hidet.__version__,
+        num_kernels=num_kernels,
+        graph_hash=graph_hash,
+        share_map=graph.share_map,
     )
 
 
@@ -191,11 +199,15 @@ def get_workspace_size_impl(cpu_size: Var, cuda_size: Var):
             for idx in [cpu_idx, cuda_idx]:
                 sb += memory_planner_init(idx)
             for node in graph_nodes:
-                for y in node.outputs:
+                for output_idx, y in enumerate(node.outputs):
                     if y in graph_intermediates:
-                        sb += DeclareStmt(
-                            tensor_ptr[y], init=memory_planner_allocate(device2idx[y.device.kind], tensor_size[y])
-                        )
+                        if node.share_map and y in node.share_map:
+                            # share memory with an input tensor: share_map maps this output's index to an input index
+                            input_idx: int = node.share_map[output_idx]
+                            init_addr = tensor_ptr[node.inputs[input_idx]]
+                        else:
+                            init_addr = memory_planner_allocate(device2idx[y.device.kind], tensor_size[y])
+                        sb += DeclareStmt(tensor_ptr[y], init=init_addr)
                 sb += AssignStmt(cpu_size, primitives.max(cpu_size, memory_planner_used(cpu_idx)))
                 sb += AssignStmt(cuda_size, primitives.max(cuda_size, memory_planner_used(cuda_idx)))
                 for x in node.inputs:
@@ -247,11 +259,15 @@ def launch_impl(inputs: List[Var], outputs: List[Var], p_kernels: Var):
                         node_params.append(d2w[x.device.kind] + tensor_ptr[x])
                     else:
                         raise RuntimeError("Unknown tensor {}".format(x))
-                for y in node.outputs:
+                for output_idx, y in enumerate(node.outputs):
                     if y in graph_intermediates:
-                        sb += DeclareStmt(
-                            tensor_ptr[y], init=memory_planner_allocate(d2i[y.device.kind], tensor_size[y])
-                        )
+                        if node.share_map and y in node.share_map:
+                            # share memory with an input tensor: share_map maps this output's index to an input index
+                            input_idx: int = node.share_map[output_idx]
+                            init_addr = tensor_ptr[node.inputs[input_idx]]
+                        else:
+                            init_addr = memory_planner_allocate(d2i[y.device.kind], tensor_size[y])
+                        sb += DeclareStmt(tensor_ptr[y], init=init_addr)
                         node_params.append(d2w[y.device.kind] + tensor_ptr[y])
                     elif y in graph.outputs:
                         node_params.append(outputs[graph.outputs.index(y)])

diff --git a/python/hidet/drivers/build_task.py b/python/hidet/drivers/build_task.py
@@ -202,6 +202,7 @@ def get_signature(t: TensorNode, device: str) -> TensorSignature:
         symbols=[v.name for v in task.symbols],
         inputs=[get_signature(t, input_device) for t in task.inputs],
         outputs=[get_signature(t, output_device) for t in task.outputs],
+        share_map=task.share_map,
         target=build_target,
         num_candidates=num_candidates,
         hidet_version=hidet.__version__,