# EEG-ML/fileprocessor.py (bkutasi/EEG-ML, main branch)
import numpy as np
from numpy import ndarray
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import minmax_scale
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler


class FileProcessor:
    """Turn a 3-D data array and its labels into train/validation sets.

    The instance stores the raw inputs in ``x`` and ``y``; calling
    :meth:`preprocessor` replaces them with the processed training split.
    The static helpers can also be used on their own with one-hot labels.
    """

    def __init__(self, x, y):
        """Store the raw data.

        Args:
            x: Input data; :meth:`preprocessor` reads ``x.shape[0..2]``, so
                it must be 3-dimensional.
            y: One label per row of ``x``.
        """
        self.x = x
        self.y = y
        # Not written anywhere else in this class; kept so existing callers
        # that read these attributes keep working.
        self.x_resampled = None
        self.y_resampled = None

    def preprocessor(
        self, *, train_fraction: float = 0.8
    ) -> tuple[ndarray, ndarray, ndarray, ndarray]:
        """Preprocess the stored data and split it into train/validation.

        Steps, in order:
        1. Flatten every sample from 2-D to 1-D (the array goes 3-D -> 2-D).
        2. One-hot encode the labels.
        3. Split by position: the first ``train_fraction`` of the rows is
           the training set, the remainder is the validation set (no
           shuffling is done here).
        4. Standardize with a ``StandardScaler`` fitted on the training
           rows only, then apply the same transform to the validation rows.
        5. Min-max scale every row independently to ``[-1, 1]``.
        6. Reshape both sets back to 3-D.
        7. Equalize the class counts of the training set only.

        Args:
            train_fraction: Fraction of rows (taken from the start) used for
                training. Must be strictly between 0 and 1.

        Returns:
            ``(x_train, y_train, x_val, y_val)``. ``x_train`` / ``y_train``
            are also stored in ``self.x`` / ``self.y``.

        Raises:
            ValueError: If ``train_fraction`` is not strictly between 0 and 1.

        Note:
            This method overwrites the ``x`` and ``y`` attributes of the
            instance with the processed training split.
        """
        if not 0.0 < train_fraction < 1.0:
            raise ValueError(
                f"train_fraction must be between 0 and 1, got {train_fraction!r}"
            )

        # Flatten to 2-D because the sklearn scalers work on (rows, features).
        x_shape = self.x.shape
        flat = np.reshape(self.x, (x_shape[0], x_shape[1] * x_shape[2]))

        # Encode before splitting so both splits share the same class columns.
        labels = FileProcessor.to_one_hot(self.y)

        # Split BEFORE fitting the scaler: fitting on all rows would let
        # validation statistics leak into the training data.
        split_index = int(len(flat) * train_fraction)
        x_train, x_val = flat[:split_index], flat[split_index:]
        y_train, y_val = labels[:split_index], labels[split_index:]

        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_val = scaler.transform(x_val)

        # axis=1 scales each row on its own, so no statistics are shared
        # between rows (or between the two splits) in this step.
        x_train = minmax_scale(x_train, axis=1, feature_range=(-1, 1), copy=False)
        x_val = minmax_scale(x_val, axis=1, feature_range=(-1, 1), copy=False)

        # SMOTE oversampling (see smote_processor) is intentionally not
        # applied here to save memory.

        # NOTE(review): this reshapes (n, a * b) to (n, b, a), i.e. the two
        # trailing axes are swapped relative to the input WITHOUT a
        # transpose, so values are regrouped unless a == b. The output shape
        # is kept as-is because callers may depend on it; if the intent is a
        # real axis swap, reshape to (n, a, b) and transpose(0, 2, 1) instead
        # - confirm against the model's expected input.
        x_train = np.reshape(
            x_train,
            (x_train.shape[0], x_train.shape[1] // x_shape[1], x_shape[1]),
        )
        x_val = np.reshape(
            x_val, (x_val.shape[0], x_val.shape[1] // x_shape[1], x_shape[1])
        )

        # Balance the classes of the training split only.
        self.x, self.y = FileProcessor.equalize_samples(x_train, y_train)

        return self.x, self.y, x_val, y_val

    @staticmethod
    def to_one_hot(y):
        """One-hot encode arbitrary labels.

        Each distinct label is mapped to its position in the sorted list of
        unique labels, and the resulting indices are passed to
        ``to_categorical``.

        Args:
            y: Array-like of labels; it is flattened before encoding.

        Returns:
            The one-hot matrix produced by ``to_categorical``, one row per
            label and one column per distinct label.
        """
        # return_inverse yields, for every element, the index of its label in
        # the sorted unique array. Working on a fresh integer array avoids
        # writing indices back into an array of the labels' own dtype.
        _, class_indices = np.unique(np.ravel(y), return_inverse=True)
        return to_categorical(class_indices)

    @staticmethod
    def smote_processor(x, y):
        """Oversample the data with SMOTE.

        Args:
            x: 2-D feature array passed to ``SMOTE.fit_resample``.
            y: Labels passed to ``SMOTE.fit_resample``.

        Returns:
            The resampled ``(x, y)`` pair. ``random_state`` is fixed to 42,
            so repeated calls on the same input give the same result.
        """
        sm = SMOTE(random_state=42)
        x, y = sm.fit_resample(x, y)
        return x, y

    @staticmethod
    def equalize_samples(x: ndarray, y: ndarray) -> tuple[ndarray, ndarray]:
        """Undersample so every populated class has the same number of rows.

        For each class column of ``y`` the first ``min_samples`` matching
        rows are kept, where ``min_samples`` is the size of the smallest
        class that has at least one row. Classes with no rows are skipped
        when computing that minimum; otherwise a single absent class would
        silently empty the whole result.

        Args:
            x: The input data array; indexed along its first axis.
            y: One-hot labels of shape ``(n_rows, n_classes)``.

        Returns:
            ``(new_x, new_y)``, grouped by class in column order. Within a
            class the original row order is preserved.
        """
        # Column sums of a one-hot matrix are the per-class row counts.
        class_counts = y.sum(axis=0)

        populated = class_counts > 0
        min_samples = int(class_counts[populated].min()) if populated.any() else 0

        # One boolean row mask per class column.
        class_masks = [y[:, i] == 1 for i in range(y.shape[1])]

        # Keep at most min_samples rows per class; an absent class simply
        # contributes an empty slice.
        new_x = np.concatenate(
            [x[class_mask][:min_samples] for class_mask in class_masks]
        )
        new_y = np.concatenate(
            [y[class_mask][:min_samples] for class_mask in class_masks]
        )

        return new_x, new_y

    @staticmethod
    def remove_majority_class(x: ndarray, y: ndarray) -> tuple[ndarray, ndarray]:
        """Drop the largest class, both its rows and its label column.

        Args:
            x: The input data array; indexed along its first axis.
            y: One-hot labels of shape ``(n_rows, n_classes)``.

        Returns:
            ``(new_x, new_y)``, grouped by remaining class in column order.
            ``new_y`` has one column fewer than ``y``.
        """
        # Column sums of a one-hot matrix are the per-class row counts.
        class_counts = y.sum(axis=0)

        # On ties np.argmax picks the first (lowest-index) class.
        majority_class_index = np.argmax(class_counts)

        # One boolean row mask per class column, minus the majority class.
        class_masks = [y[:, i] == 1 for i in range(y.shape[1])]
        class_masks.pop(majority_class_index)

        new_x = np.concatenate([x[class_mask] for class_mask in class_masks])
        new_y = np.concatenate([y[class_mask] for class_mask in class_masks])

        # The majority column is all zeros in the remaining rows; remove it.
        new_y = np.delete(new_y, majority_class_index, axis=1)

        return new_x, new_y