ProVADA/run_multiple.py at main · SUwonglab/ProVADA · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
"""
run_multiple.py

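Launches several ProVADA runs from the same config file, one subprocess per GPU
listed in --available_gpus. Each run gets its own seed, drawn from a random
generator seeded with --seed.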
"""

import argparse
import os
import random
import shlex
import subprocess
import tempfile
import time
from pathlib import Path


def get_parser():
    """
    Build the command-line parser for the run_multiple.py script.

    Returns:
        argparse.ArgumentParser: parser exposing --config, --seed,
        --available_gpus, --reference_distribution_file, --verbose and
        --sampler_type.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Launch several run_provada.py runs on the same config file, "
            "one run per listed GPU."
        ),
    )
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="Path to the config file passed to every run.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        required=False,
        default=42,
        help=(
            "Seed of the random generator from which the individual seed of "
            "each run is drawn (default: %(default)s)."
        ),
    )
    parser.add_argument(
        "--available_gpus",
        type=int,
        nargs="+",
        required=True,
        help="GPU indices to use; one run is launched for each index given.",
    )
    parser.add_argument(
        "--reference_distribution_file",
        type=str,
        default=None,
        help=(
            "Optional path forwarded to run_provada.py as "
            "--reference_distribution_file."
        ),
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        # NOTE(review): main() in this file never reads args.verbose.
        help=(
            "Accepted for command-line compatibility; this launcher "
            "currently does not read it."
        ),
    )
    parser.add_argument(
        "--sampler_type",
        type=str,
        required=False,
        default=None,
        help="Optional value forwarded to run_provada.py as --sampler_type.",
    )
    return parser


def _build_command(args, seed, logging_subdir):
    """
    Build the argument list for one run_provada.py invocation.

    The command is a list rather than a shell string so that values containing
    spaces or shell metacharacters (e.g. a config path) reach the child
    process unmodified.
    """
    command = [
        "python",
        "run_provada.py",
        "--config", args.config,
        # Each child is restricted to a single GPU through
        # CUDA_VISIBLE_DEVICES, so the device is always passed as cuda:0.
        "--device", "cuda:0",
        "--seed", str(seed),
        "--logging_subdir", logging_subdir,
    ]

    if args.reference_distribution_file is not None:
        command += ["--reference_distribution_file", args.reference_distribution_file]

    if args.sampler_type is not None:
        command += ["--sampler_type", args.sampler_type]

    return command


def _write_early_failure_log(log_path, exit_code, gpu, seed, config, command,
                             temp_stdout, temp_stderr):
    """
    Write a header plus the captured stderr/stdout of a failed run to log_path.
    """
    separator = "=" * 80
    # Quote each argument so the logged line can be pasted back into a shell.
    command_line = " ".join(shlex.quote(part) for part in command)

    with open(log_path, 'w') as log_file:
        log_file.write(separator + "\n")
        log_file.write(f"EARLY FAILURE - Process exited with code {exit_code}\n")
        log_file.write(f"GPU: {gpu}, Seed: {seed}\n")
        log_file.write(f"Config: {config}\n")
        log_file.write(f"Command: {command_line}\n")
        log_file.write(separator + "\n\n")

        # stderr first: it is the most likely place for the error message.
        temp_stderr.seek(0)
        stderr_content = temp_stderr.read()
        if stderr_content.strip():
            log_file.write("STDERR:\n")
            log_file.write(stderr_content)
            log_file.write("\n\n")

        temp_stdout.seek(0)
        stdout_content = temp_stdout.read()
        if stdout_content.strip():
            log_file.write("STDOUT:\n")
            log_file.write(stdout_content)
            log_file.write("\n")


def main():
    """
    Launch one run_provada.py subprocess per GPU and report how each one ended.

    Parses the command line, starts all runs concurrently (each restricted to
    one GPU via CUDA_VISIBLE_DEVICES in its own environment copy), waits for
    all of them, then prints one status line per run. For a run that exited
    non-zero without leaving a matching log file, the captured stdout/stderr
    are written to an "early failure" log under logs/<logging_subdir>/.

    The parent process's own environment is left untouched, and the temporary
    capture files are removed even if launching or reporting raises.
    """
    parser = get_parser()
    args = parser.parse_args()

    print(
        f"Launching {len(args.available_gpus)} runs on the following GPUs: {args.available_gpus}"
    )

    # All runs of this launch share one logging subdirectory named after the
    # launch time.
    logging_subdir = time.strftime("multiple_run_group_%Y-%m-%d_%H-%M-%S")

    # Per-run seeds come from a generator seeded with --seed, so the same
    # --seed and GPU list reproduce the same per-run seeds.
    rng = random.Random(args.seed)

    config_name = Path(args.config).stem

    runs = []
    temp_files = []

    try:
        # Launch a run for each requested GPU
        for gpu in args.available_gpus:
            seed = rng.randint(0, 1000000)
            command = _build_command(args, seed, logging_subdir)

            # Set the visible device on a copy of the environment only, so
            # the launcher's own os.environ is not modified.
            env = os.environ.copy()
            env["CUDA_VISIBLE_DEVICES"] = str(gpu)

            # Capture stdout/stderr in temporary files so early errors can be
            # recovered. Register each file as soon as it exists so the
            # cleanup below sees it even if the next step raises.
            temp_stdout = tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.stdout')
            temp_files.append(temp_stdout)
            temp_stderr = tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.stderr')
            temp_files.append(temp_stderr)

            process = subprocess.Popen(
                command,
                stdout=temp_stdout,
                stderr=temp_stderr,
                env=env,
            )
            runs.append({
                'process': process,
                'gpu': gpu,
                'seed': seed,
                'command': command,
                'stdout': temp_stdout,
                'stderr': temp_stderr,
            })

        # Wait for all processes to complete
        for run in runs:
            run['process'].wait()

        print("All runs completed")

        # NOTE(review): assumes run_provada.py writes its logs under
        # logs/<logging_subdir>/ relative to the current directory - confirm.
        log_dir = Path("logs") / logging_subdir

        # Check for failures and handle error logging
        for run in runs:
            process = run['process']
            gpu = run['gpu']
            seed = run['seed']
            exit_code = process.returncode

            if exit_code == 0:
                print(f"Process {process.pid} (GPU {gpu}, seed {seed}) completed successfully")
                continue

            # An existing log file indicates the run got far enough to create
            # its own log.
            # NOTE(review): this is a substring match on the seed, so it can
            # also match another run's log whose name contains these digits.
            log_files = list(log_dir.glob(f"*{seed}*.log")) if log_dir.exists() else []

            if log_files:
                print(f"Process {process.pid} (GPU {gpu}, seed {seed}) exited with code {exit_code} - check log: {log_files[0]}")
                continue

            # No log file: this was an early failure, so create one from the
            # captured output.
            log_dir.mkdir(parents=True, exist_ok=True)
            log_path = log_dir / f"{config_name}_early_failure_{seed}.log"
            _write_early_failure_log(
                log_path,
                exit_code,
                gpu,
                seed,
                args.config,
                run['command'],
                run['stdout'],
                run['stderr'],
            )

            print(f"Process {process.pid} (GPU {gpu}, seed {seed}) failed early (exit code {exit_code}) - error logged to: {log_path}")
    finally:
        # Clean up temp files, including on error paths. Removal is best
        # effort: a failure is reported but does not mask an exception that
        # is already propagating.
        for temp_file in temp_files:
            temp_file.close()
            try:
                os.unlink(temp_file.name)
            except OSError as err:
                print(f"Warning: could not remove temporary file {temp_file.name}: {err}")


# Run the launcher only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()