|
45 | 45 | reimport_benchmarks_report, |
46 | 46 | ) |
47 | 47 | from guidellm.mock_server import MockServer, MockServerConfig |
48 | | -from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset |
49 | 48 | from guidellm.scheduler import StrategyType |
50 | 49 | from guidellm.schemas import GenerativeRequestType |
51 | 50 | from guidellm.settings import print_config |
52 | 51 | from guidellm.utils import Console, DefaultGroupHandler, get_literal_vals |
53 | 52 | from guidellm.utils import cli as cli_tools |
54 | 53 |
|
55 | | -__all__ = [ |
56 | | - "STRATEGY_PROFILE_CHOICES", |
57 | | - "benchmark", |
58 | | - "cli", |
59 | | - "config", |
60 | | - "dataset", |
61 | | - "decode_escaped_str", |
62 | | - "from_file", |
63 | | - "mock_server", |
64 | | - "preprocess", |
65 | | - "run", |
66 | | -] |
67 | | - |
68 | 54 | STRATEGY_PROFILE_CHOICES: list[str] = list(get_literal_vals(ProfileType | StrategyType)) |
69 | 55 | """Available strategy and profile type choices for benchmark execution.""" |
70 | 56 |
|
@@ -469,128 +455,6 @@ def preprocess(): |
469 | 455 | """Dataset preprocessing utilities.""" |
470 | 456 |
|
471 | 457 |
|
472 | | -@preprocess.command( |
473 | | - "dataset", |
474 | | - help=( |
475 | | - "Process a dataset to have specific prompt and output token sizes. " |
476 | | - "Supports multiple strategies for handling prompts and optional " |
477 | | - "Hugging Face Hub upload.\n\n" |
478 | | - "DATA: Path to the input dataset or dataset ID.\n\n" |
479 | | - "OUTPUT_PATH: Path to save the processed dataset, including file suffix." |
480 | | - ), |
481 | | - context_settings={"auto_envvar_prefix": "GUIDELLM"}, |
482 | | -) |
483 | | -@click.argument( |
484 | | - "data", |
485 | | - type=str, |
486 | | - required=True, |
487 | | -) |
488 | | -@click.argument( |
489 | | - "output_path", |
490 | | - type=click.Path(file_okay=True, dir_okay=False, writable=True, resolve_path=True), |
491 | | - required=True, |
492 | | -) |
493 | | -@click.option( |
494 | | - "--processor", |
495 | | - type=str, |
496 | | - required=True, |
497 | | - help="Processor or tokenizer name for calculating token counts.", |
498 | | -) |
499 | | -@click.option( |
500 | | - "--processor-args", |
501 | | - default=None, |
502 | | - callback=cli_tools.parse_json, |
503 | | - help="JSON string of arguments to pass to the processor constructor.", |
504 | | -) |
505 | | -@click.option( |
506 | | - "--data-args", |
507 | | - callback=cli_tools.parse_json, |
508 | | - help="JSON string of arguments to pass to dataset creation.", |
509 | | -) |
510 | | -@click.option( |
511 | | - "--short-prompt-strategy", |
512 | | - type=click.Choice([s.value for s in ShortPromptStrategy]), |
513 | | - default=ShortPromptStrategy.IGNORE.value, |
514 | | - show_default=True, |
515 | | - help="Strategy for handling prompts shorter than target length.", |
516 | | -) |
517 | | -@click.option( |
518 | | - "--pad-char", |
519 | | - type=str, |
520 | | - default="", |
521 | | - callback=decode_escaped_str, |
522 | | - help="Character to pad short prompts with when using 'pad' strategy.", |
523 | | -) |
524 | | -@click.option( |
525 | | - "--concat-delimiter", |
526 | | - type=str, |
527 | | - default="", |
528 | | - help=( |
529 | | - "Delimiter for concatenating short prompts (used with 'concatenate' strategy)." |
530 | | - ), |
531 | | -) |
532 | | -@click.option( |
533 | | - "--prompt-tokens", |
534 | | - type=str, |
535 | | - default=None, |
536 | | - help="Prompt tokens configuration (JSON, YAML file, or key=value string).", |
537 | | -) |
538 | | -@click.option( |
539 | | - "--output-tokens", |
540 | | - type=str, |
541 | | - default=None, |
542 | | - help="Output tokens configuration (JSON, YAML file, or key=value string).", |
543 | | -) |
544 | | -@click.option( |
545 | | - "--push-to-hub", |
546 | | - is_flag=True, |
547 | | - help="Push the processed dataset to Hugging Face Hub.", |
548 | | -) |
549 | | -@click.option( |
550 | | - "--hub-dataset-id", |
551 | | - type=str, |
552 | | - default=None, |
553 | | - help=("Hugging Face Hub dataset ID for upload (required if --push-to-hub is set)."), |
554 | | -) |
555 | | -@click.option( |
556 | | - "--random-seed", |
557 | | - type=int, |
558 | | - default=42, |
559 | | - show_default=True, |
560 | | - help="Random seed for reproducible token sampling.", |
561 | | -) |
562 | | -def dataset( |
563 | | - data, |
564 | | - output_path, |
565 | | - processor, |
566 | | - processor_args, |
567 | | - data_args, |
568 | | - short_prompt_strategy, |
569 | | - pad_char, |
570 | | - concat_delimiter, |
571 | | - prompt_tokens, |
572 | | - output_tokens, |
573 | | - push_to_hub, |
574 | | - hub_dataset_id, |
575 | | - random_seed, |
576 | | -): |
577 | | - process_dataset( |
578 | | - data=data, |
579 | | - output_path=output_path, |
580 | | - processor=processor, |
581 | | - prompt_tokens=prompt_tokens, |
582 | | - output_tokens=output_tokens, |
583 | | - processor_args=processor_args, |
584 | | - data_args=data_args, |
585 | | - short_prompt_strategy=short_prompt_strategy, |
586 | | - pad_char=pad_char, |
587 | | - concat_delimiter=concat_delimiter, |
588 | | - push_to_hub=push_to_hub, |
589 | | - hub_dataset_id=hub_dataset_id, |
590 | | - random_seed=random_seed, |
591 | | - ) |
592 | | - |
593 | | - |
594 | 458 | @cli.command( |
595 | 459 | "mock-server", |
596 | 460 | help=( |
|
0 commit comments