diff --git a/benchmarks/README.md b/benchmarks/README.md
index ed2f776..58a1253 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1 +1,5 @@
-# a suite of a bit longer (regression) tests
+# A suite of longer-running (regression) tests
+
+Running `data_regression_experiment.sh` performs a stand-alone experiment that shows how closely rankings computed on progressively smaller subsamples approximate the ranking obtained from the full data set.
+
+![comparison](./comparison.png)
diff --git a/benchmarks/analyse_rankings.py b/benchmarks/analyse_rankings.py
new file mode 100644
index 0000000..cc0f724
--- /dev/null
+++ b/benchmarks/analyse_rankings.py
@@ -0,0 +1,72 @@
+from __future__ import annotations
+
+import glob
+import os
+import sys
+
+import matplotlib.pyplot as plt
+
+def extract_just_ranking(dfile):
+    """Extract the ranking (one feature per row) from an output file."""
+    ranks = []
+    with open(dfile) as df:
+        next(df)  # Skip the header line
+        for line in df:
+            parts = line.strip().split('\t')
+            ranks.append(parts[1])  # Column 1 holds the feature identifier
+    return ranks
+
+def calculate_mismatch_scores(dfolder, all_folders, mismatches):
+    """Compute hits@k overlap between each subsample's ranking and the full-data ranking."""
+    all_counts = [int(folder.split('_').pop()) for folder in all_folders if 'ranking' in folder]
+
+    ranking_out_struct = {}
+    for count in all_counts:
+        rpath = os.path.join(dfolder, f'ranking_{count}', 'feature_singles.tsv')
+        ranking_out_struct[count] = extract_just_ranking(rpath)
+
+    # The ranking built from the most rows serves as the reference.
+    pivot_score_key = max(all_counts)
+    reference_ranking = ranking_out_struct[pivot_score_key]
+
+    out_results = {}
+    for ranking_id, ranking in ranking_out_struct.items():
+        mismatches_counter = 0
+        for el in ranking[:mismatches]:
+            if el not in reference_ranking[:mismatches]:
+                mismatches_counter += 1
+        out_results[ranking_id] = 100 * (1 - mismatches_counter / mismatches)
+
+    return dict(sorted(out_results.items(), key=lambda x: x[0]))
+
+def plot_precision_curve(results, pivot_score_key, mismatches, axs, c1, c2):
+    """Plot one hits@k curve into the given subplot cell."""
+    instances = [100 * (k / pivot_score_key) for k in results.keys()]
+    values = list(results.values())
+
+    axs[c1, c2].plot(instances, values, marker='o', linestyle='-', color='black')
+    axs[c1, c2].invert_xaxis()
+    axs[c1, c2].set(xlabel='Proportion of data used (%)', ylabel=f'hits@{mismatches} (%)', title=f'Approximation, top {mismatches} features')
+    axs[c1, c2].grid(True)
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print('Usage: python analyse_rankings.py <data_folder>')
+        sys.exit(1)
+
+    dfolder = sys.argv[1]
+    mismatch_range = [1, 5, 10, 20]
+    fig, axs = plt.subplots(2, 2)
+    fig.set_figheight(10)
+    fig.set_figwidth(10)
+    all_folders = list(glob.glob(os.path.join(dfolder, '*')))
+    row = -1
+    for enx, mismatches in enumerate(mismatch_range):
+        if enx % 2 == 0:
+            row += 1
+        col = enx % 2
+        out_results = calculate_mismatch_scores(dfolder, all_folders, mismatches)
+        pivot_score_key = max(out_results)
+        plot_precision_curve(out_results, pivot_score_key, mismatches, axs, row, col)
+    plt.tight_layout()
+    plt.savefig('comparison.png', dpi=300)
diff --git a/benchmarks/comparison.png b/benchmarks/comparison.png
new file mode 100644
index 0000000..382bdcf
Binary files /dev/null and b/benchmarks/comparison.png differ
diff --git a/benchmarks/data_regression_experiment.sh b/benchmarks/data_regression_experiment.sh
new file mode 100644
index 0000000..0097ab3
--- /dev/null
+++ b/benchmarks/data_regression_experiment.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+set -euo pipefail  # Strict mode: exit on errors, unset variables, and pipeline failures
+
+# Configurable variables
+NUM_ROWS=1000000
+NUM_FEATURES=100
+INPUT_FILE="test_data_synthetic/data.csv"
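+# Row counts to sample; the largest equals NUM_ROWS, so the ranking computed
+# on it serves as the full-data reference in analyse_rankings.py.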
+SIZES=('50000' '100000' '200000' '500000' '600000' '700000' '800000' '900000' '1000000')
+
+# Remove a directory if it exists
+remove_directory_safely() {
+    directory_to_remove=$1
+    if [ -d "$directory_to_remove" ]; then
+        echo "Removing directory: $directory_to_remove"
+        rm -rvf "$directory_to_remove"
+    else
+        echo "Directory does not exist, skipping: $directory_to_remove"
+    fi
+}
+
+# Generate a synthetic data set
+generate_data() {
+    echo "Generating random data with $NUM_ROWS rows and $NUM_FEATURES features..."
+    outrank --task data_generator --num_synthetic_rows $NUM_ROWS --num_synthetic_features $NUM_FEATURES
+    echo "Random data generation complete."
+}
+
+# Sample row subsets from the full data set
+sample_subspaces() {
+    for i in "${SIZES[@]}"
+    do
+        dataset="test_data_synthetic/dataset_$i"
+        outfile="$dataset/data.csv"
+        mkdir -p "$dataset"
+
+        if [ -f "$INPUT_FILE" ]; then
+            echo "Sampling $i rows into $outfile..."
+            head -n "$i" "$INPUT_FILE" > "$outfile"
+            echo "Sampling for $outfile done."
+        else
+            echo "Input file $INPUT_FILE not found. Skipping sampling for $i rows."
+        fi
+    done
+}
+
+# Run feature ranking on every sampled subset
+feature_ranking() {
+    for i in "${SIZES[@]}"
+    do
+        dataset="test_data_synthetic/dataset_$i"
+        output_folder="./test_data_synthetic/ranking_$i"
+
+        if [ ! -d "$dataset" ]; then
+            echo "Dataset directory $dataset does not exist. Skipping ranking for $i rows."
+            continue
+        fi
+
+        echo "Proceeding with feature ranking for $i rows..."
+        outrank --task ranking --data_path "$dataset" --data_source csv-raw \
+            --combination_number_upper_bound 60 --output_folder "$output_folder" \
+            --disable_tqdm True
+
+        echo "Summarising feature ranking for $i rows..."
+        outrank --task ranking_summary --output_folder "$output_folder" --data_path "$dataset"
+        echo "Ranking for $i done."
+    done
+}
+
+# Analyse the rankings and produce comparison.png
+analyse_rankings() {
+    echo "Analysing the rankings..."
+    python analyse_rankings.py test_data_synthetic
+    echo "Analysis complete."
+}
+
+# Main script execution
+remove_directory_safely test_data_synthetic/
+generate_data
+sample_subspaces
+feature_ranking
+analyse_rankings
+
+echo "Script execution finished."
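+
+# The analysis reports hits@k: the percentage of each subsample's top-k features
+# that also appear in the top-k of the full-data ranking (see comparison.png).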