diff --git a/benchmarks/scripts/cub/bench/__init__.py b/benchmarks/scripts/cub/bench/__init__.py
index 88f739198..37736a04e 100644
--- a/benchmarks/scripts/cub/bench/__init__.py
+++ b/benchmarks/scripts/cub/bench/__init__.py
@@ -1,5 +1,5 @@
 from .config import *
-from .storage import Storage
+from .storage import *
 from .bench import Bench
 from .cmake import CMake
 from .score import *
diff --git a/benchmarks/scripts/cub/bench/storage.py b/benchmarks/scripts/cub/bench/storage.py
index 7197bb7ca..152abc66c 100644
--- a/benchmarks/scripts/cub/bench/storage.py
+++ b/benchmarks/scripts/cub/bench/storage.py
@@ -12,14 +12,9 @@ def blob_to_samples(blob):
     return np.squeeze(fpzip.decompress(blob))
 
 
-class Storage:
-    _instance = None
-
-    def __new__(cls, *args, **kwargs):
-        if cls._instance is None:
-            cls._instance = super().__new__(cls, *args, **kwargs)
-            cls._instance.conn = sqlite3.connect(db_name)
-        return cls._instance
+class StorageBase:
+    def __init__(self, db_path):
+        self.conn = sqlite3.connect(db_path)
 
     def connection(self):
         return self.conn
@@ -42,3 +37,33 @@ def alg_to_df(self, algname):
         df['samples'] = df['samples'].apply(blob_to_samples)
 
         return df
+
+    def store_df(self, algname, df):
+        df['samples'] = df['samples'].apply(fpzip.compress)
+        df.to_sql(algname, self.conn, if_exists='replace', index=False)
+
+
+class Storage:
+    _instance = None
+
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            # object.__new__ rejects extra positional/keyword args in Python 3;
+            # forward only cls so the singleton constructor never raises.
+            cls._instance = super().__new__(cls)
+            cls._instance.base = StorageBase(db_name)
+        return cls._instance
+
+    def connection(self):
+        return self.base.connection()
+
+    def exists(self):
+        return self.base.exists()
+
+    def algnames(self):
+        return self.base.algnames()
+
+    def alg_to_df(self, algname):
+        return self.base.alg_to_df(algname)
+
+    def store_df(self, algname, df):
+        return self.base.store_df(algname, df)
diff --git a/benchmarks/scripts/merge.py b/benchmarks/scripts/merge.py
new file mode 100755
index 000000000..e32474360
--- /dev/null
+++ b/benchmarks/scripts/merge.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+
+from scipy import stats
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+import numpy as np
+import statistics
+import functools
+import argparse
+import cub
+import os
+
+
+def valid_alpha(value):
+    fvalue = float(value)
+    if fvalue < 0 or fvalue > 1:
+        raise argparse.ArgumentTypeError(
+            "Alpha must be a float between 0 and 1.")
+    return fvalue
+
+
+def file_exists(value):
+    if not os.path.isfile(value):
+        raise argparse.ArgumentTypeError(f"The file '{value}' does not exist.")
+    return value
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description='Process alpha and files.')
+    parser.add_argument('--alpha', type=valid_alpha, default=0.01, required=False,
+                        help='Alpha value must be a float between 0 and 1.')
+    parser.add_argument('files', type=file_exists, nargs='+',
+                        help='At least two files are required.')
+    parser.add_argument('--plot',
+                        action=argparse.BooleanOptionalAction,
+                        help="Show base distributions.")
+    args = parser.parse_args()
+
+    if len(args.files) < 2:
+        parser.error("At least two files are required.")
+
+    return args
+
+
+def distributions_are_different(alpha, samples_list):
+    # H0: the distributions are not different
+    # H1: the distributions are different
+    result = stats.kruskal(*samples_list)
+
+    # Reject H0
+    return result.pvalue < alpha
+
+
+def get_group_id(alpha, num_files, row):
+    if distributions_are_different(alpha, [row[fid] for fid in range(num_files)]):
+        data = {}
+        for fid in range(num_files):
+            data[f'file{fid}'] = row[fid]
+        sns.displot(data, kind="kde")
+        plt.show()
+
+
+def plot(args):
+    dfs = {}
+    for fid, file in enumerate(args.files):
+        storage = cub.bench.StorageBase(file)
+        for alg in storage.algnames():
+            df = storage.alg_to_df(alg)
+            df = df[df['variant'] == 'base'].drop(columns=['variant', 'center', 'elapsed'])
+            df['file'] = fid
+            if alg not in dfs:
+                dfs[alg] = [df]
+            else:
+                dfs[alg].append(df)
+
+    get_group_id_closure = functools.partial(get_group_id, args.alpha, len(args.files))
+
+    for alg in dfs:
+        print(alg)
+        df = pd.concat(dfs[alg], ignore_index=True)
+        index = list(df.columns)
+        index.remove('samples')
+        index.remove('file')
+        df_pivot = df.pivot(index=index, columns='file', values='samples')
+        df_pivot.apply(get_group_id_closure, axis=1)
+
+
+def combine_samples(num_files, row):
+    row = row.dropna()
+    combined_samples = []
+    for fid in range(num_files):
+        if fid in row:
+            combined_samples.extend(row[fid])
+    return np.asarray(sorted(combined_samples), dtype=np.float32)
+
+
+def compute_center(row):
+    if len(row['samples']) == 0:
+        return float('inf')
+    return statistics.median(row['samples'])
+
+
+def merge(args):
+    dfs = {}
+    for fid, file in enumerate(args.files):
+        storage = cub.bench.StorageBase(file)
+        for alg in storage.algnames():
+            df = storage.alg_to_df(alg)
+            df['file'] = fid
+            if alg not in dfs:
+                dfs[alg] = [df]
+            else:
+                dfs[alg].append(df)
+
+    combine_closure = functools.partial(combine_samples, len(args.files))
+
+    storage = cub.bench.StorageBase(cub.bench.db_name)  # NOTE(review): output DB — confirm it is not among the inputs
+
+    for alg in dfs:
+        df = pd.concat(dfs[alg], ignore_index=True)
+        index = list(df.columns)
+        index.remove('samples')
+        index.remove('elapsed')
+        index.remove('center')
+        index.remove('file')
+        df_pivot = df.pivot(index=index, columns='file', values='samples')
+        df_pivot['samples'] = df_pivot.apply(combine_closure, axis=1)
+        df_pivot['center'] = df_pivot.apply(compute_center, axis=1)
+        df_pivot['elapsed'] = 0.0  # TODO compute min or sum, not sure
+        df_pivot.drop(columns=list(range(len(args.files))), inplace=True, errors='ignore')  # a file may lack this alg
+        df_pivot.reset_index(inplace=True)
+        storage.store_df(alg, df_pivot)
+
+
+def main():
+    args = parse_arguments()
+    if args.plot:
+        plot(args)
+    merge(args)
+
+
+if __name__ == "__main__":
+    main()