Skip to content

Commit

Permalink
Merge branch 'upstream/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
fishingguy456 committed Jan 16, 2024
2 parents dcc6a45 + 53922fc commit dbfcc56
Show file tree
Hide file tree
Showing 34 changed files with 2,364 additions and 734 deletions.
43 changes: 26 additions & 17 deletions .github/scripts/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
import json
import subprocess
import pathlib
import numpy as np
import tqdm
from db_utils import get_db_conn
import argparse
from tabulate import tabulate

external_models = ['llama-7b', 'gpt2']

Expand All @@ -21,15 +20,7 @@ def run_command(cmd):
raise RuntimeError(f'Command {cmd} failed with return code {ret}.')
return stdout

def get_bench_cmd(run_type, run_id, run_name, run_param_name, dtype):
# Get the name of the benchmark script from DB
conn = get_db_conn()
cursor = conn.cursor()
query = f'SELECT runfile FROM {run_type} WHERE id = {run_id}'
cursor.execute(query)
runfile = cursor.fetchall()[0][0]
cursor.close()
conn.close()
def get_bench_cmd(run_type, run_id, run_name, runfile, run_param_name, dtype):
if run_name in external_models:
runfile = './models/bench/' + runfile
else:
Expand All @@ -38,29 +29,47 @@ def get_bench_cmd(run_type, run_id, run_name, run_param_name, dtype):
return cmd

if __name__ == '__main__':
fh = open('run_configs.json')
parser = argparse.ArgumentParser(prog='Run Benchmarks')
parser.add_argument(
'--print',
action='store_true',
default=False,
help='Print results'
)
parser.add_argument(
'--configs',
type=str,
default='run_configs.json',
help='Specify configurations file to use for benchmarking'
)
args = parser.parse_args()
configs_file = args.configs
fh = open(configs_file)
run_configs = json.load(fh)
fh.close()
hw_config = os.environ.get('HW_CONFIG')
print('hw:', hw_config)
for run_config in run_configs:
# Append hardware_config column
run_config['hardware_config'] = hw_config
# Extract configurations
run_type = run_config['type']
run_id = run_config['id']
run_name = run_config['name']
runfile = run_config['runfile']
run_param_id = run_config['param_id']
run_param_name = run_config['param_name']
run_dtype_id = run_config['dtype_id']
run_dtype_name = run_config['dtype_name']
cmd = get_bench_cmd(run_type, run_id, run_name, run_param_name, run_dtype_name)
cmd = get_bench_cmd(run_type, run_id, run_name, runfile, run_param_name, run_dtype_name)
outputs = run_command(cmd)
if outputs:
# The second-to-last line of every benchmark script's stdout is the latency. (The last line is empty.)
latency = float(outputs.split('\n')[-2])
run_config['latency'] = latency
else:
run_config['latency'] = 999.99
with open('run_configs.json', 'w') as fh:
json.dump(run_configs, fh)
with open(configs_file, 'w') as fh:
json.dump(run_configs, fh)

if args.print:
print(tabulate(run_configs, headers="keys"))
18 changes: 11 additions & 7 deletions .github/scripts/start_instances.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def run_command(cmd):

# Fetch the compile server instance ID from DB and add it to list of instances to launch
query = (
'SELECT cloud_provider_id, instance_id, 0 FROM compile_server LIMIT 1'
f'SELECT cloud_provider_id, instance_id, 0 FROM compile_server WHERE org = \'{repo_org}\' LIMIT 1'
)
cursor.execute(query)
rows = cursor.fetchall()
Expand All @@ -58,7 +58,7 @@ def run_command(cmd):
# For now, we run all model/input combinations by default
run_configs = []
query = (
'SELECT model.id as model_id, model.name as model_name, input_parameter.id as param_id, '
'SELECT model.id as model_id, model.name as model_name, model.runfile as runfile, input_parameter.id as param_id, '
'input_parameter.parameter as param_name, dtype.id as dtype_id, dtype.name as dtype_name '
'FROM model JOIN model_input_parameter ON '
'model.id = model_input_parameter.model_id JOIN input_parameter ON '
Expand All @@ -67,13 +67,13 @@ def run_command(cmd):
cursor.execute(query)
rows = cursor.fetchall()
for row in rows:
model_id, model_name, param_id, param_name, dtype_id, dtype_name = row
run_configs.append({'type': 'model', 'id': int(model_id), 'name': model_name,
model_id, model_name, model_runfile, param_id, param_name, dtype_id, dtype_name = row
run_configs.append({'type': 'model', 'id': int(model_id), 'name': model_name, 'runfile': model_runfile,
'param_id': int(param_id), 'param_name': param_name,
'dtype_id': int(dtype_id), 'dtype_name': dtype_name,
})
query = (
'SELECT operator.id as operator_id, operator.name as operator_name, input_parameter.id as param_id, '
'SELECT operator.id as operator_id, operator.name as operator_name, operator.runfile as runfile, input_parameter.id as param_id, '
'input_parameter.parameter as param_name, dtype.id as dtype_id, dtype.name as dtype_name '
'FROM operator JOIN operator_input_parameter ON '
'operator.id = operator_input_parameter.operator_id JOIN input_parameter ON '
Expand All @@ -82,8 +82,8 @@ def run_command(cmd):
cursor.execute(query)
rows = cursor.fetchall()
for row in rows:
op_id, op_name, param_id, param_name, dtype_id, dtype_name = row
run_configs.append({'type': 'operator', 'id': int(op_id), 'name': op_name,
op_id, op_name, op_runfile, param_id, param_name, dtype_id, dtype_name = row
run_configs.append({'type': 'operator', 'id': int(op_id), 'name': op_name, 'runfile': op_runfile,
'param_id': int(param_id), 'param_name': param_name,
'dtype_id': int(dtype_id), 'dtype_name': dtype_name,
})
Expand All @@ -99,6 +99,8 @@ def run_command(cmd):
cloud_provider_id, instance_id, _ = instance
if cloud_provider_id == 1: # AWS
cmd = ['aws', 'ec2', 'start-instances', '--instance-ids', instance_id]
elif cloud_provider_id == 2: # Always on, no need to launch. Do Nothing.
cmd = ['true']
else:
raise ValueError(f'Unknown cloud provider id: {cloud_provider_id}')
output = run_command(cmd)
Expand All @@ -118,6 +120,8 @@ def run_command(cmd):
raise RuntimeError(f'Failed to check status for {instance_id} on cloud provider {cloud_provider_id}.')
if output.stdout.count('ok') >= 2:
started = True
elif cloud_provider_id == 2: # Always on, no need to launch. Do Nothing.
started = True
else:
raise ValueError(f'Unknown cloud provider id: {cloud_provider_id}')

Expand Down
4 changes: 4 additions & 0 deletions .github/scripts/stop_instances.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ def run_command(cmd):
instance_id = ids[1]
if cloud_provider_id == 1: # AWS
cmd = ['aws', 'ec2', 'stop-instances', '--instance-ids', instance_id]
elif cloud_provider_id == 2: # Always on, no need to stop. Do Nothing.
cmd = ['true']
else:
raise ValueError(f'Unknown cloud provider id: {cloud_provider_id}')
output = run_command(cmd)
Expand All @@ -42,5 +44,7 @@ def run_command(cmd):
# An instance still running would contain its id in the status.
if instance_id not in output.stdout:
stopped = True
elif cloud_provider_id == 2: # Always on, no need to stop. Do Nothing.
stopped = True
else:
raise ValueError(f'Unknown cloud provider id: {cloud_provider_id}')
43 changes: 24 additions & 19 deletions .github/workflows/regression.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,6 @@ on:
issue_comment:
types: [created]

env:
CI_DB_HOSTNAME: ${{ secrets.CI_DB_HOSTNAME }}
CI_DB_PORT: ${{ secrets.CI_DB_PORT }}
CI_DB_USERNAME: ${{ secrets.CI_DB_USERNAME }}
CI_DB_PASSWORD: ${{ secrets.CI_DB_PASSWORD }}
CI_CS_HOSTNAME: ${{ secrets.CI_CS_HOSTNAME }}
CI_CS_PORT: ${{ secrets.CI_CS_PORT }}
CI_CS_USERNAME: ${{ secrets.CI_CS_USERNAME }}
CI_CS_PASSWORD: ${{ secrets.CI_CS_PASSWORD }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-1
HF_TOKEN: ${{ secrets.HF_TOKEN }}

jobs:
start_instances:
if: |
Expand All @@ -50,6 +36,13 @@ jobs:
# TODO: Allow launching only specified GPU instances
HW_CONFIG: all
REPO_NAME: ${{ github.repository }}
CI_DB_HOSTNAME: ${{ secrets.CI_DB_HOSTNAME }}
CI_DB_PORT: ${{ secrets.CI_DB_PORT }}
CI_DB_USERNAME: ${{ secrets.CI_DB_USERNAME }}
CI_DB_PASSWORD: ${{ secrets.CI_DB_PASSWORD }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-1

- name: Upload run configs
uses: actions/upload-artifact@v3
Expand Down Expand Up @@ -110,6 +103,10 @@ jobs:
uses: actions/download-artifact@v3
with:
name: run_configs

- name: Clear cache
run: |
hidet cache clear --all
- name: Run tests
timeout-minutes: 2880
Expand All @@ -121,6 +118,11 @@ jobs:
REPO_BRANCH: |
${{ github.event_name == 'workflow_dispatch' && github.ref_name ||
format('pull/{0}', github.event.issue.number) }}
CI_CS_HOSTNAME: ${{ secrets.CI_CS_HOSTNAME }}
CI_CS_PORT: ${{ secrets.CI_CS_PORT }}
CI_CS_USERNAME: ${{ secrets.CI_CS_USERNAME }}
CI_CS_PASSWORD: ${{ secrets.CI_CS_PASSWORD }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}

- name: Upload run configs
uses: actions/upload-artifact@v3
Expand All @@ -135,10 +137,6 @@ jobs:
steps:
- name: Checkout repo
uses: actions/checkout@v4
with:
ref: |
${{ github.event_name == 'workflow_dispatch' && github.ref_name ||
format('refs/pull/{0}/head', github.event.issue.number) }}

- name: Install dependencies
run: pip install mysql-connector-python
Expand Down Expand Up @@ -166,6 +164,10 @@ jobs:
COMMIT_TIME: ${{ env.COMMIT_TIME }}
COMMIT_AUTHOR: ${{ env.COMMIT_AUTHOR }}
HW_CONFIGS: ${{ needs.start_instances.outputs.hw_configs }}
CI_DB_HOSTNAME: ${{ secrets.CI_DB_HOSTNAME }}
CI_DB_PORT: ${{ secrets.CI_DB_PORT }}
CI_DB_USERNAME: ${{ secrets.CI_DB_USERNAME }}
CI_DB_PASSWORD: ${{ secrets.CI_DB_PASSWORD }}

stop_instances:
if: |
Expand All @@ -181,4 +183,7 @@ jobs:
- name: Run main Python script
run: timeout 900 python ./.github/scripts/stop_instances.py
env:
STARTED_INSTANCES: ${{ needs.start_instances.outputs.started_instances }}
STARTED_INSTANCES: ${{ needs.start_instances.outputs.started_instances }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-1
3 changes: 2 additions & 1 deletion apps/compile_server/resources/compilation.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ def clone_github_repo(owner: str, repo: str, version: str) -> str:
branches = repo.git.branch("--all").split()
# If local branch already exists, delete it as we prepare to do a new fresh checkout
# This is because the local branch might be divergent with remote, so we just discard it
if version in branches:
# The exception is the main branch, since it should never diverge
if version in branches and version != 'main':
repo.git.checkout('main')
repo.git.branch('-D', version)
if 'pull/' in version:
Expand Down
34 changes: 25 additions & 9 deletions python/hidet/drivers/build_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ def get_graph_weights(graph):


def get_graph_intermediates(graph):
"""
Get the intermediate tensors of the graph: {output tensors of nodes} - {output tensors of the graph}
"""
intermediates: List[Tensor] = []
for node in graph.nodes:
for y in node.outputs:
Expand Down Expand Up @@ -145,7 +148,12 @@ def get_graph_meta_data(graph: FlowGraph, num_kernels, space: int) -> GraphMetaD
graph_hash = sha256('\n'.join(lines).encode('utf-8')).hexdigest()[:16]

return GraphMetaData(
inputs=inputs, outputs=outputs, hidet_version=hidet.__version__, num_kernels=num_kernels, graph_hash=graph_hash
inputs=inputs,
outputs=outputs,
hidet_version=hidet.__version__,
num_kernels=num_kernels,
graph_hash=graph_hash,
share_map=graph.share_map,
)


Expand Down Expand Up @@ -191,11 +199,15 @@ def get_workspace_size_impl(cpu_size: Var, cuda_size: Var):
for idx in [cpu_idx, cuda_idx]:
sb += memory_planner_init(idx)
for node in graph_nodes:
for y in node.outputs:
for output_idx, y in enumerate(node.outputs):
if y in graph_intermediates:
sb += DeclareStmt(
tensor_ptr[y], init=memory_planner_allocate(device2idx[y.device.kind], tensor_size[y])
)
if node.share_map and y in node.share_map:
# share the memory with input tensor
input_idx: int = node.share_map[output_idx]
init_addr = tensor_ptr[node.inputs[input_idx]]
else:
init_addr = memory_planner_allocate(device2idx[y.device.kind], tensor_size[y])
sb += DeclareStmt(tensor_ptr[y], init=init_addr)
sb += AssignStmt(cpu_size, primitives.max(cpu_size, memory_planner_used(cpu_idx)))
sb += AssignStmt(cuda_size, primitives.max(cuda_size, memory_planner_used(cuda_idx)))
for x in node.inputs:
Expand Down Expand Up @@ -247,11 +259,15 @@ def launch_impl(inputs: List[Var], outputs: List[Var], p_kernels: Var):
node_params.append(d2w[x.device.kind] + tensor_ptr[x])
else:
raise RuntimeError("Unknown tensor {}".format(x))
for y in node.outputs:
for output_idx, y in enumerate(node.outputs):
if y in graph_intermediates:
sb += DeclareStmt(
tensor_ptr[y], init=memory_planner_allocate(d2i[y.device.kind], tensor_size[y])
)
if node.share_map and y in node.share_map:
# share the memory with input tensor
input_idx: int = node.share_map[output_idx]
init_addr = tensor_ptr[node.inputs[input_idx]]
else:
init_addr = memory_planner_allocate(d2i[y.device.kind], tensor_size[y])
sb += DeclareStmt(tensor_ptr[y], init=init_addr)
node_params.append(d2w[y.device.kind] + tensor_ptr[y])
elif y in graph.outputs:
node_params.append(outputs[graph.outputs.index(y)])
Expand Down
1 change: 1 addition & 0 deletions python/hidet/drivers/build_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ def get_signature(t: TensorNode, device: str) -> TensorSignature:
symbols=[v.name for v in task.symbols],
inputs=[get_signature(t, input_device) for t in task.inputs],
outputs=[get_signature(t, output_device) for t in task.outputs],
share_map=task.share_map,
target=build_target,
num_candidates=num_candidates,
hidet_version=hidet.__version__,
Expand Down
Loading

0 comments on commit dbfcc56

Please sign in to comment.