From 604bb1678bb97d4c8ba004762bcaf954fd267ddf Mon Sep 17 00:00:00 2001 From: Davide Tricella <56402624+dadit97@users.noreply.github.com> Date: Sun, 11 Sep 2022 23:54:42 +0200 Subject: [PATCH] Fix: test_labels and train_labels reshape added --- src/main.py | 2 +- src/preprocess.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index 06fe966..acc0160 100644 --- a/src/main.py +++ b/src/main.py @@ -96,7 +96,7 @@ def make_roc(labels, results, name): # If true, the balancing will be done before resulting in a great performances gain earlyBalance = True problem_to_solve = 'CANCELLED' # The alternative is 'DIVERTED' -usePyspark = False # If true, uses PySpark, otherwise Pandas +usePyspark = True # If true, uses PySpark, otherwise Pandas # If false, only #records_per_file records will be sampled from the most recent year csv sample_from_all_files = True records_per_file = 500000 diff --git a/src/preprocess.py b/src/preprocess.py index b478df3..664ab88 100644 --- a/src/preprocess.py +++ b/src/preprocess.py @@ -121,6 +121,9 @@ def preprocess(index: str, useAllFrames: bool, size: int, balance_size: int, use numpy.array(test_data.collect()), numpy.array(test_labels.collect())) + result[1].shape = [result[1].shape[0]] + result[3].shape = [result[3].shape[0]] + finish_time = tm.time() - start_time print_and_save_time("Dataset splitting concluded: " + str(finish_time) + " seconds")