-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsubmission.py
264 lines (212 loc) · 10.9 KB
/
submission.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
# Library Import
import getpass
import os
import shutil
from collections import Counter
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from matplotlib.ticker import FuncFormatter
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Function to generate final submission csv file
def generate_submission_csv(genre_predictions, filename="submission.csv"):
    """Write genre predictions to a two-column submission CSV.

    The file gets an ``Id`` column holding the positions ``0..n-1`` of the
    predictions and a ``Genre`` column holding the predictions themselves.
    The DataFrame index is not written.

    Args:
        genre_predictions: Sized sequence (list, NumPy array, ...) of
            predicted genre labels in submission order.
        filename: Path of the CSV file to write.
    """
    submission_df = pd.DataFrame(data={
        "Id": range(len(genre_predictions)),
        "Genre": genre_predictions
    })
    submission_df.to_csv(filename, index=False)
    print(f"Submission file '{filename}' created successfully.")
# Function to compute pseudo test set accuracy
def calculate_pseudo_test_accuracy(predictions):
    """Print the accuracy of ``predictions`` on the pseudo test split.

    The reference labels are read from the module-level ``Y_test``, so this
    must only be called after the train/validation/pseudo-test split has
    been made. The score is printed with two decimals; nothing is returned.

    Args:
        predictions: Predicted class codes, one per row of the pseudo test
            split, in the same order as ``Y_test``.
    """
    print(f"Pseudo Test Set accuracy: {accuracy_score(Y_test, predictions):.2f}")
# Function to compute training set accuracy
def calculate_training_accuracy(predictions):
    """Print the accuracy of ``predictions`` on the training split.

    The reference labels are read from the module-level ``Y_train``, so this
    must only be called after the train/validation/pseudo-test split has
    been made. The score is printed with two decimals; nothing is returned.

    Args:
        predictions: Predicted class codes, one per row of the training
            split, in the same order as ``Y_train``.
    """
    print(f"Training Set accuracy: {accuracy_score(Y_train, predictions):.2f}")
# Load the training data and the test inputs
# Both feature CSVs are read with their first column as the row index and
# their first three rows as a 3-level column header.
x_train = pd.read_csv('Data/X_train.csv', index_col=0, header=[0, 1, 2])
# Plain-array copy of the training features (not referenced again in this script).
x_train_np = np.array(x_train)
# Labels: first column is the row index; the 'Genre' column is used further down.
y_train = pd.read_csv('Data/y_train.csv', index_col=0)
y_train_np = y_train.squeeze().to_numpy() # Make y_train a NumPy array
# Real (unlabelled) test inputs that the submission is generated for.
x_test = pd.read_csv('Data/X_test.csv', index_col=0, header=[0, 1, 2])
# Plain-array copy of the test features (not referenced again in this script).
x_test_np = np.array(x_test)
# Flatten the columns for easier wrangling
# Each 3-level header tuple is joined into a single 'level0_level1_level2' name.
x_train_flat_columns = ['_'.join(col).strip() for col in x_train.columns.values]
x_train.columns = x_train_flat_columns
x_test_flat_columns = ['_'.join(col).strip() for col in x_test.columns.values]
# The test frame is labelled with the *training* names (unchanged behaviour), so
# both frames carry identical feature names for the scaler fitted further down.
# That relabelling is positional, so make a header mismatch between the two
# CSVs visible instead of silently papering over it.
if x_test_flat_columns != x_train_flat_columns:
    print("Warning: X_test column names differ from X_train; "
          "training column names are applied positionally.")
x_test.columns = x_train_flat_columns
# Label-encode training labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_np.ravel()) # genre strings -> integer class codes
# Split training data into training and temporary validation sets
# 60% of the rows go to training, 40% are held back; random_state fixes the shuffle.
X_train, X_temp, Y_train, Y_temp = train_test_split(x_train, y_train_encoded, test_size=0.4, random_state=42)
# Split the temporary validation set into validation and pseudo test set
# The held-back 40% is halved, giving 20% validation and 20% pseudo test overall.
# Note the casing: X_test / Y_test are the labelled pseudo-test split, while the
# lowercase x_test is the real, unlabelled test set loaded from disk.
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)
# Standardise respective subsets after splitting to avoid data leakage
# The scaler statistics come from the training split only and are reused for every other subset.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
X_real_test_scaled = scaler.transform(x_test) # real test set to generate submission on
# Load best XGB model
# The model is loaded from disk rather than trained here.
# NOTE(review): the file name suggests it was fitted on all labelled data; if so,
# the train / pseudo-test accuracies printed later are not held-out scores - confirm.
final_model_name = 'Models/xgboost-64%-all-data'
final_booster = xgb.Booster() # instantiate
final_booster.load_model(final_model_name) # load
train_predictions = final_booster.predict(xgb.DMatrix(X_train_scaled)) # predict on train set
pseudo_test_predictions = final_booster.predict(xgb.DMatrix(X_test_scaled)) # predict on pseudo-test set
real_test_predictions = final_booster.predict(xgb.DMatrix(X_real_test_scaled)) # predict on real test set
# Decode numeric predictions to string labels
# Assumes predict() yields one class code per row (not per-class probabilities),
# so the cast to int recovers the LabelEncoder codes - TODO confirm against the
# objective the saved model was trained with.
genre_predictions_decoded = label_encoder.inverse_transform(real_test_predictions.astype(int))
# Make submission csv with decoded predictions
generate_submission_csv(genre_predictions_decoded, filename="submission.csv")
# ######## MAKE PLOTS ########
export_username = "ts" # Only save plots to dropbox on right machine


# Function to save plots to EPS for overleaf
def save_plot(plot, filename):
    """Export a figure as EPS into the Overleaf figures folder.

    The export only happens when the current login name equals the
    module-level ``export_username``; on any other machine the call does
    nothing (no file is written and nothing is printed).

    Args:
        plot: Object exposing ``savefig(path, format=...)``, e.g. a
            matplotlib ``Figure``.
        filename: Base name of the output file; ``".eps"`` is appended.
    """
    # Guard first: the destination below only exists on the export machine.
    if getpass.getuser() != export_username:
        return
    filepath = "/Users/ts/Library/CloudStorage/Dropbox/Apps/Overleaf/SML Practical/Figures"
    filename += ".eps"
    plot.savefig(os.path.join(filepath, filename), format='eps')  # Save as EPS
    print("Saved plot to {}".format(filename))
# Make EDA Plots
# PCA Plot
# Fit PCA on the scaled training split; a fractional n_components asks for as
# many components as are needed to reach that share of explained variance.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
# Running total of explained variance, one entry per retained component.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Zero-based position of the first component at which the running total
# reaches 80% / 90%, and the running total at that position.
idx_full_80 = np.where(cumulative_variance >= 0.8)[0][0]
idx_full_90 = np.where(cumulative_variance >= 0.9)[0][0]
y_80 = cumulative_variance[idx_full_80]
y_90 = cumulative_variance[idx_full_90]

pcaplot = plt.figure(figsize=(10, 6))
# Plot the cumulative explained variance
plt.plot(cumulative_variance, color=plt.cm.viridis(0.5))
plt.xlabel('Number of Components', fontsize=14)
plt.ylabel('Cumulative Explained Variance', fontsize=14)
plt.yticks(np.arange(0, 1, step=0.1))
# Dashed guide lines towards each threshold point: vertical first, then horizontal.
# NOTE(review): axvline's ymax and axhline's xmax are axes fractions in
# matplotlib, while data-derived values are passed here - confirm the lines
# end where intended.
for marker_idx, marker_y, vertical_shade, horizontal_shade in (
        (idx_full_80, y_80, 0.3, 0.4),
        (idx_full_90, y_90, 0.6, 0.7),
):
    # noinspection PyTypeChecker
    plt.axvline(x=marker_idx, ymax=marker_y, color=plt.cm.viridis(vertical_shade), linestyle='--')
    # noinspection PyTypeChecker
    plt.axhline(y=marker_y, xmax=marker_idx / len(cumulative_variance),
                color=plt.cm.viridis(horizontal_shade), linestyle='--')
# Scatter points with adjusted Viridis colors
for marker_idx, marker_y, marker_shade, legend_label in (
        (idx_full_80, y_80, 0.3, '80% variance'),
        (idx_full_90, y_90, 0.6, '90% variance'),
):
    plt.scatter(marker_idx, marker_y, color=plt.cm.viridis(marker_shade), label=legend_label)
plt.legend(loc='best')
save_plot(pcaplot, "pca")
# Class Balance Plot
# Eight evenly spaced viridis colours converted to hex strings; the same palette
# is reused by the box plots below.
# NOTE(review): presumably one colour per genre - confirm the class count is 8.
viridis_colors = plt.cm.viridis(np.linspace(0, 1, 8))
custom_palette = [matplotlib.colors.rgb2hex(color) for color in viridis_colors]
class_bal = plt.figure(figsize=(10, 6))
# Horizontal count plot: one bar per value of the 'Genre' column of y_train.
sns.countplot(data=y_train, y='Genre', palette=custom_palette)
plt.xlabel('Count', fontsize=14)
plt.ylabel('Genre', fontsize=14)
# Adjust tick label sizes
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# Widen the left margin (0.2) relative to the others.
plt.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1)
save_plot(class_bal, "Class-Balance")
# Merge Genre labels on to training data (x_train's index is matched against 'Id' in y_train)
x_train_with_genre = x_train.merge(y_train, left_index=True, right_on='Id')
# Create the subplots: a 2x2 grid, one panel per feature listed below
box1, axs = plt.subplots(nrows=2, ncols=2, figsize=(16, 9))
# (column to plot, panel title), in row-major panel order
boxplot_panels = (
    ('spectral_centroid_median_01', 'Spectral Centroid Median 01'),
    ('spectral_rolloff_median_01', 'Spectral Rolloff Median 01'),
    ('spectral_contrast_median_04', 'Spectral Contrast Median 04'),
    ('mfcc_median_01', 'MFCC Median 01'),
)
for panel_ax, (feature_column, panel_title) in zip(axs.flat, boxplot_panels):
    # Distribution of the feature per genre, genres on the y-axis
    sns.boxplot(x=feature_column, y='Genre', data=x_train_with_genre, ax=panel_ax, palette=custom_palette)
    panel_ax.set_title(panel_title)
# Adjust the font scale for better readability. This is seaborn's global
# setting and is applied only after the four panels above have been drawn.
sns.set(font_scale=1.6)
plt.tight_layout()
save_plot(box1, "boxplot-1")
# Correlation matrix
# Restrict to the training-split columns whose name contains 'spectral_contrast'
# and compute their pairwise correlations.
df_corr = X_train.filter(like='spectral_contrast')
corr_mat = df_corr.corr()
cormat = plt.figure(figsize=(16, 13))
sns.heatmap(corr_mat, cmap='viridis')
plt.xticks(rotation=45, ha='right') # Rotate x-axis labels
plt.xlabel('') # Remove x-axis title
plt.ylabel('') # Remove y-axis title
plt.tight_layout()
save_plot(cormat, "correlation")
# Get decoded class labels for plots
# True and predicted genre names for the pseudo-test split; the predictions are
# cast to int before decoding, as for the submission above.
y_test_decoded = label_encoder.inverse_transform(Y_test)
pseudo_test_preds_labels = label_encoder.inverse_transform(pseudo_test_predictions.astype(int))
# Print accuracies; the raw (un-cast) booster outputs are passed to the helpers.
calculate_training_accuracy(train_predictions)
calculate_pseudo_test_accuracy(pseudo_test_predictions)
# Make XGB Visualizations
# Retrieve column names
# Flattened training column names, used to relabel the importance plots.
feature_names = x_train_flat_columns
# Custom formatter to one decimal place
formatter = FuncFormatter(lambda x, _: f'{x:.1f}')
# Define a list of colors for the bar plots
# Four evenly spaced viridis colours, one per importance panel.
colors = plt.cm.viridis(np.linspace(0, 1, 4))
# Create the subplots with constrained_layout instead of tight_layout
# Note: this rebinds `axs`, which earlier referred to the box-plot axes.
importanceplots, axs = plt.subplots(nrows=2, ncols=2, figsize=(16, 12), constrained_layout=True)
# Define importance types and corresponding titles
# The two lists are index-aligned: titles[k] is the panel title for importance_types[k].
importance_types = ['weight', 'gain', 'cover', 'total_gain']
titles = ['Weight', 'Gain', 'Cover', 'Total Gain']
# Plot importance for each type
# One panel per importance type: the axes, importance type, title and bar
# colour are index-aligned, so walk them in lockstep.
for ax, importance_type, title, bar_color in zip(axs.flat, importance_types, titles, colors):
    xgb.plot_importance(final_booster, importance_type=importance_type, max_num_features=10, ax=ax,
                        show_values=False, color=bar_color)
    ax.xaxis.set_major_formatter(formatter)
    ax.set_xlabel('Importance')
    ax.set_title(title)
    # The tick labels are expected to look like 'f<column index>'; strip the
    # 'f' and use the index to look up the real feature name.
    ticks = ax.get_yticklabels()
    indices = [int(tick.get_text().replace('f', '')) for tick in ticks]
    new_labels = [feature_names[feature_idx] for feature_idx in indices]
    ax.set_yticklabels(new_labels)
save_plot(importanceplots, "XGB-Importance")
# Plot Misprediction Frequency by class
# Boolean mask over the pseudo-test rows: True where the decoded prediction is wrong
mispredictions = (y_test_decoded != pseudo_test_preds_labels)
# How often each true class occurs overall, and how often it was mispredicted
total_counts = Counter(y_test_decoded)
mispredicted_counts = Counter(y_test_decoded[mispredictions])
# Share of each true class that was mispredicted, in percent
# (a Counter yields 0 for classes that were never mispredicted)
misprediction_freq = {
    class_label: (mispredicted_counts[class_label] / class_total) * 100
    for class_label, class_total in total_counts.items()
}
# Alphabetical class order keeps the bars in a consistent sequence
sorted_class_labels = sorted(total_counts)
bar_heights = [misprediction_freq[class_label] for class_label in sorted_class_labels]
# One viridis colour per class
colors = plt.cm.viridis(np.linspace(0, 1, len(sorted_class_labels)))
# Bar chart of misprediction frequencies (as percentages)
xgb_mispred_freq = plt.figure(figsize=(10, 6))
plt.bar(sorted_class_labels, bar_heights, color=colors)
plt.xlabel('Classes')
plt.ylabel('Misprediction Frequency (%)')
plt.xticks(ticks=range(len(sorted_class_labels)), labels=sorted_class_labels, rotation=45)
# Increase the bottom margin to leave room for the rotated class names
xgb_mispred_freq.subplots_adjust(bottom=0.3)
save_plot(xgb_mispred_freq, "xgb_mispred_freq")
# Classification Report Heatmap
# Plot the classification report as a heatmap
# Build the report as a dict and turn it into a frame with one row per report
# entry and one column per metric.
report_dict = classification_report(Y_test, pseudo_test_predictions, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()
# Decode the sorted unique class codes found in Y_test into genre names; these
# become the row labels of the heatmap.
# NOTE(review): this assumes the report has exactly one row per class present
# in Y_test - confirm no class appears only in the predictions.
unique_class_names = label_encoder.inverse_transform(sorted(np.unique(Y_test)))
# Drop the 'support' column and the last three rows (the summary rows that
# follow the per-class rows), since we only want the individual classes
report_df = report_df.drop(columns=['support'])
class_report_df = report_df.iloc[:-3, :]
heatmap = plt.figure(figsize=(10, 8))
# NOTE(review): fmt and annot_kws configure cell annotations, but annot is not
# passed here - confirm whether annotated cells were intended.
sns.heatmap(class_report_df, cmap='viridis', cbar=True, fmt='.2g',
annot_kws={'color': 'black'}, # Add contrasting color for readability
yticklabels=unique_class_names)
plt.ylabel('Class Label', fontsize=14)
plt.xlabel('Metrics', fontsize=14)
# Widen the left margin of the figure.
heatmap.subplots_adjust(left=0.2)
save_plot(heatmap, "XGB-Heatmap")
# Copy code to overleaf
# The destination only exists on the export machine, so apply the same
# username guard that save_plot uses. On any other machine the copy is skipped
# instead of raising after all results have already been produced.
if getpass.getuser() == export_username:
    overleaf_code_dir = '/Users/ts/Library/CloudStorage/Dropbox/Apps/Overleaf/SML Practical/Code'
    shutil.copy('submission.py', overleaf_code_dir)
    shutil.copy('tuning.txt', overleaf_code_dir)
    print("Source Code and Tuning log copied to Overleaf")