Migrate formatter from yapf to ruff #1688

Closed · wants to merge 3 commits
11 changes: 5 additions & 6 deletions .github/workflows/yapf.yml → .github/workflows/ruff-format.yml
@@ -1,4 +1,4 @@
name: yapf
name: ruff format

on:
# Trigger the workflow on push or pull request,
@@ -10,7 +10,7 @@ on:
branches:
- main
jobs:
yapf:
ruff-format:
runs-on: ubuntu-latest
strategy:
matrix:
@@ -24,8 +24,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install yapf==0.32.0
pip install toml==0.10.2
- name: Running yapf
pip install ruff==0.1.5
- name: Running format
run: |
yapf --diff --recursive vllm tests
ruff format --check .
73 changes: 38 additions & 35 deletions benchmarks/benchmark_latency.py
@@ -42,9 +42,11 @@ def run_to_completion(profile: bool = False):
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()

llm.generate(prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False)
llm.generate(
prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False,
)

end_time = time.perf_counter()
latency = end_time - start_time
@@ -59,43 +61,44 @@ def run_to_completion(profile: bool = False):
latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile=False))
print(f'Avg latency: {np.mean(latencies)} seconds')
print(f"Avg latency: {np.mean(latencies)} seconds")


if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
parser.add_argument('--model', type=str, default='facebook/opt-125m')
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=['awq', 'squeezellm', None],
default=None)
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--input-len', type=int, default=32)
parser.add_argument('--output-len', type=int, default=128)
parser.add_argument('--batch-size', type=int, default=8)
parser.add_argument('--n',
type=int,
default=1,
help='Number of generated sequences per prompt.')
parser.add_argument('--use-beam-search', action='store_true')
parser.add_argument('--num-iters',
type=int,
default=3,
help='Number of iterations to run.')
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
description="Benchmark the latency of processing a single batch of "
"requests till completion."
)
parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument(
"--quantization", "-q", choices=["awq", "squeezellm", None], default=None
)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--input-len", type=int, default=32)
parser.add_argument("--output-len", type=int, default=128)
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument(
"--n", type=int, default=1, help="Number of generated sequences per prompt."
)
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument(
'--dtype',
"--num-iters", type=int, default=3, help="Number of iterations to run."
)
parser.add_argument(
"--trust-remote-code",
action="store_true",
help="trust remote code from huggingface",
)
parser.add_argument(
"--dtype",
type=str,
default='auto',
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
help='data type for model weights and activations. '
default="auto",
choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
help="data type for model weights and activations. "
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
"for FP32 and FP16 models, and BF16 precision "
"for BF16 models.",
)
args = parser.parse_args()
main(args)
102 changes: 64 additions & 38 deletions benchmarks/benchmark_serving.py
@@ -40,10 +40,7 @@ def sample_requests(
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [
data for data in dataset
if len(data["conversations"]) >= 2
]
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [
(data["conversations"][0]["value"], data["conversations"][1]["value"])
@@ -164,9 +161,17 @@ async def benchmark(
tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate):
prompt, prompt_len, output_len = request
task = asyncio.create_task(send_request(backend, api_url, prompt,
prompt_len, output_len,
best_of, use_beam_search))
task = asyncio.create_task(
send_request(
backend,
api_url,
prompt,
prompt_len,
output_len,
best_of,
use_beam_search,
)
)
tasks.append(task)
await asyncio.gather(*tasks)

@@ -181,8 +186,16 @@ def main(args: argparse.Namespace):
input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

benchmark_start_time = time.perf_counter()
asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
args.use_beam_search, args.request_rate))
asyncio.run(
benchmark(
args.backend,
api_url,
input_requests,
args.best_of,
args.use_beam_search,
args.request_rate,
)
)
benchmark_end_time = time.perf_counter()
benchmark_time = benchmark_end_time - benchmark_start_time
print(f"Total time: {benchmark_time:.2f} s")
@@ -191,43 +204,56 @@ def main(args: argparse.Namespace):
# Compute the latency statistics.
avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY])
print(f"Average latency: {avg_latency:.2f} s")
avg_per_token_latency = np.mean([
latency / (prompt_len + output_len)
for prompt_len, output_len, latency in REQUEST_LATENCY
])
avg_per_token_latency = np.mean(
[
latency / (prompt_len + output_len)
for prompt_len, output_len, latency in REQUEST_LATENCY
]
)
print(f"Average latency per token: {avg_per_token_latency:.2f} s")
avg_per_output_token_latency = np.mean([
latency / output_len
for _, output_len, latency in REQUEST_LATENCY
])
print("Average latency per output token: "
f"{avg_per_output_token_latency:.2f} s")
avg_per_output_token_latency = np.mean(
[latency / output_len for _, output_len, latency in REQUEST_LATENCY]
)
print("Average latency per output token: " f"{avg_per_output_token_latency:.2f} s")


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Benchmark the online serving throughput.")
parser.add_argument("--backend", type=str, default="vllm",
choices=["vllm", "tgi"])
description="Benchmark the online serving throughput."
)
parser.add_argument("--backend", type=str, default="vllm", choices=["vllm", "tgi"])
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--dataset", type=str, required=True,
help="Path to the dataset.")
parser.add_argument("--tokenizer", type=str, required=True,
help="Name or path of the tokenizer.")
parser.add_argument("--best-of", type=int, default=1,
help="Generates `best_of` sequences per prompt and "
"returns the best one.")
parser.add_argument(
"--dataset", type=str, required=True, help="Path to the dataset."
)
parser.add_argument(
"--tokenizer", type=str, required=True, help="Name or path of the tokenizer."
)
parser.add_argument(
"--best-of",
type=int,
default=1,
help="Generates `best_of` sequences per prompt and " "returns the best one.",
)
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument("--num-prompts", type=int, default=1000,
help="Number of prompts to process.")
parser.add_argument("--request-rate", type=float, default=float("inf"),
help="Number of requests per second. If this is inf, "
"then all the requests are sent at time 0. "
"Otherwise, we use Poisson process to synthesize "
"the request arrival times.")
parser.add_argument(
"--num-prompts", type=int, default=1000, help="Number of prompts to process."
)
parser.add_argument(
"--request-rate",
type=float,
default=float("inf"),
help="Number of requests per second. If this is inf, "
"then all the requests are sent at time 0. "
"Otherwise, we use Poisson process to synthesize "
"the request arrival times.",
)
parser.add_argument("--seed", type=int, default=0)
parser.add_argument('--trust-remote-code', action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
"--trust-remote-code",
action="store_true",
help="trust remote code from huggingface",
)
args = parser.parse_args()
main(args)
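
Note on the --request-rate flag touched in the diff above: its help text describes synthesizing request arrival times with a Poisson process. As a rough, illustrative sketch only (the names below are hypothetical and this is not the file's actual get_request coroutine), such arrivals are typically produced by sleeping for exponentially distributed gaps with mean 1 / request_rate between consecutive requests:

import asyncio
import random
from typing import AsyncGenerator, List, Tuple


async def poisson_request_stream(
    requests: List[Tuple[str, int, int]],
    request_rate: float,
) -> AsyncGenerator[Tuple[str, int, int], None]:
    """Yield (prompt, prompt_len, output_len) tuples with Poisson arrivals.

    Hypothetical sketch: with request_rate == inf every request is yielded
    immediately (all sent at time 0); otherwise the gap between consecutive
    requests is drawn from an exponential distribution with mean
    1 / request_rate, which makes the aggregate arrivals a Poisson process.
    """
    for request in requests:
        yield request
        if request_rate == float("inf"):
            continue
        # Exponential inter-arrival gap at rate `request_rate`.
        await asyncio.sleep(random.expovariate(request_rate))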