Skip to content

Commit

Permalink
Allow the analysis script to process multiple databases
Browse files Browse the repository at this point in the history
  • Loading branch information
gevtushenko committed May 25, 2023
1 parent d583228 commit 1e98052
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 37 deletions.
92 changes: 64 additions & 28 deletions benchmarks/scripts/analyze.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3

import os
import re
import cub
import math
Expand Down Expand Up @@ -174,33 +175,64 @@ def remove_matching_distributions(alpha, df):
return df[df.apply(closure, axis=1)]


def get_filenames_map(arr):
    """Map each input filename to a short label with the common prefix removed.

    Args:
        arr: List of file path strings.

    Returns:
        Dict mapping each original filename to its suffix after stripping
        the longest common character prefix shared by all entries. Empty
        dict for empty input. Note: with a single file the label is the
        empty string (the whole path is the "common prefix").
    """
    if not arr:
        # Return an empty dict (not a list) so the return type is
        # consistent with the non-empty case.
        return {}

    # os.path.commonprefix computes the longest common *character* prefix,
    # which is exactly what the manual trimming loop used to do.
    prefix = os.path.commonprefix(arr)
    return {string: string[len(prefix):] for string in arr}


def iterate_case_dfs(args, callback):
    """Iterate over per-case DataFrames gathered from all input databases.

    Opens every database listed in ``args.files``, collects the union of
    benchmark algorithm names, filters them by the ``-R`` regex, and for
    each compile-time point concatenates the matching rows from all
    databases. Variants are suffixed with a short per-file label (the
    filename with the common prefix stripped) so rows from different
    databases stay distinguishable.

    Args:
        args: Parsed CLI arguments; uses ``args.files`` and ``args.R``.
        callback: Invoked as ``callback(algname, point_str, case_df)``
            once per compile-time point. (Renamed from ``callable`` to
            avoid shadowing the builtin; all call sites pass it
            positionally.)
    """
    storages = {}
    algnames = set()
    # Short labels for tagging variants with their source database.
    filenames_map = get_filenames_map(args.files)
    for file in args.files:
        storage = cub.bench.StorageBase(file)
        algnames.update(storage.algnames())
        storages[filenames_map[file]] = storage

    pattern = re.compile(args.R)

    for algname in algnames:
        if not pattern.match(algname):
            continue

        # One accumulated DataFrame per compile-time point, merged across
        # all input databases before the callback fires.
        case_dfs = {}
        for file, storage in storages.items():
            df = storage.alg_to_df(algname)
            # Treat +/-inf centers as missing so they are dropped as well.
            with pd.option_context('mode.use_inf_as_na', True):
                df = df.dropna(subset=['center'], how='all')

            for _, row in df[['ctk', 'cub']].drop_duplicates().iterrows():
                ctk_version = row['ctk']
                cub_version = row['cub']
                ctk_cub_df = df[(df['ctk'] == ctk_version) &
                                (df['cub'] == cub_version)]

                for gpu in ctk_cub_df['gpu'].unique():
                    # .drop() returns a new frame; the previous
                    # inplace=True mutated a boolean-mask slice and
                    # triggered pandas' SettingWithCopyWarning.
                    target_df = ctk_cub_df[ctk_cub_df['gpu'] == gpu].drop(
                        columns=['ctk', 'cub', 'gpu'])
                    target_df = compute_speedup(target_df)

                    for ct_point in ct_space(target_df):
                        point_str = ", ".join(
                            ["{}={}".format(k, ct_point[k]) for k in ct_point])
                        case_df = extract_complete_variants(
                            extract_case(target_df, ct_point))
                        # Tag each variant with its source-file label.
                        case_df['variant'] = case_df['variant'].astype(str) + \
                            " ({})".format(file)
                        if point_str in case_dfs:
                            case_dfs[point_str] = pd.concat(
                                [case_dfs[point_str], case_df])
                        else:
                            case_dfs[point_str] = case_df

        for point_str, case_df in case_dfs.items():
            callback(algname, point_str, case_df)


def case_top(alpha, N, algname, ct_point_name, case_df):
Expand Down Expand Up @@ -495,17 +527,24 @@ def variants(args, mode):
iterate_case_dfs(args, functools.partial(case_variants, pattern, mode))


def file_exists(value):
    """argparse type-checker: pass *value* through if it names an existing file.

    Raising ``argparse.ArgumentTypeError`` lets argparse print a clean CLI
    error message instead of an unhandled traceback.
    """
    if os.path.isfile(value):
        return value
    raise argparse.ArgumentTypeError(f"The file '{value}' does not exist.")


def parse_arguments():
parser = argparse.ArgumentParser(
description="Analyze benchmark results.")
parser.add_argument('-R', type=str, default='.*',
help="Regex for benchmarks selection.")
parser = argparse.ArgumentParser(description="Analyze benchmark results.")
parser.add_argument(
'-R', type=str, default='.*', help="Regex for benchmarks selection.")
parser.add_argument(
'--list-benches', action=argparse.BooleanOptionalAction, help="Show available benchmarks.")
parser.add_argument(
'--coverage', action=argparse.BooleanOptionalAction, help="Show variant space coverage.")
parser.add_argument(
'--top', default=7, type=int, action='store', nargs='?', help="Show top N variants with highest score.")
parser.add_argument(
'files', type=file_exists, nargs='+', help='At least one file is required.')
parser.add_argument(
'--alpha', default=1.0, type=float)
parser.add_argument(
Expand All @@ -518,9 +557,6 @@ def parse_arguments():
def main():
args = parse_arguments()

if not cub.bench.Storage().exists():
raise Exception("Storage does not exist")

if args.list_benches:
cub.bench.list_benches()
return
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/scripts/cub/bench/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .config import *
from .storage import Storage
from .storage import *
from .bench import Bench
from .cmake import CMake
from .score import *
Expand Down
37 changes: 29 additions & 8 deletions benchmarks/scripts/cub/bench/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,9 @@ def blob_to_samples(blob):
return np.squeeze(fpzip.decompress(blob))


class Storage:
_instance = None

def __new__(cls, *args, **kwargs):
if cls._instance is None:
cls._instance = super().__new__(cls, *args, **kwargs)
cls._instance.conn = sqlite3.connect(db_name)
return cls._instance
class StorageBase:
    """Wrapper around a sqlite3 connection to one benchmark-result database.

    Unlike the ``Storage`` singleton, one instance is created per database
    path, which lets the analysis script open several result files at once.
    """

    def __init__(self, db_path):
        """Open (or create) the SQLite database at *db_path*."""
        self.conn = sqlite3.connect(db_path)

    def connection(self):
        """Return the underlying sqlite3 connection."""
        return self.conn
Expand All @@ -42,3 +37,29 @@ def alg_to_df(self, algname):
df['samples'] = df['samples'].apply(blob_to_samples)

return df

def store_df(self, algname, df):
    """Persist a benchmark DataFrame as the table named after the algorithm.

    Samples are fpzip-compressed before writing; any existing table for
    *algname* is replaced.

    Args:
        algname: Algorithm name, used as the SQL table name.
        df: DataFrame with a ``samples`` column of raw sample arrays.
    """
    # assign() works on a copy, so the caller's frame keeps its raw
    # (uncompressed) samples instead of being mutated as a side effect.
    df = df.assign(samples=df['samples'].apply(fpzip.compress))
    df.to_sql(algname, self.conn, if_exists='replace', index=False)


class Storage:
    """Process-wide singleton over the default benchmark database.

    Delegates every operation to a ``StorageBase`` opened on the configured
    ``db_name``; repeated construction returns the same instance (and thus
    the same underlying SQLite connection).
    """
    _instance = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            # object.__new__ accepts no extra arguments in Python 3;
            # forwarding *args/**kwargs here would raise TypeError.
            cls._instance = super().__new__(cls)
            cls._instance.base = StorageBase(db_name)
        return cls._instance

    def connection(self):
        return self.base.connection()

    def exists(self):
        return self.base.exists()

    def algnames(self):
        return self.base.algnames()

    def alg_to_df(self, algname):
        return self.base.alg_to_df(algname)

    def store_df(self, algname, df):
        # Delegate like the other accessors so the singleton exposes the
        # full StorageBase API (store_df was previously missing here).
        return self.base.store_df(algname, df)

0 comments on commit 1e98052

Please sign in to comment.