-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlocal_explanation.py
229 lines (193 loc) · 12.3 KB
/
local_explanation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
from . import perturbation as pe
from . import feature_extraction as fe
from . import perturbation_scores
import numpy as np
class LocalExplanationManager:
    """Manage the workflow of the local explanation phase of the explanation process.

    The manager asks the model for the probabilities of the original text and of
    every perturbed text, then builds one ``LocalExplanation`` per perturbation
    holding the nPIR / nPIRP scores of that perturbation.

    Attributes:
        input_text (str): The preprocessed original text.
        model_wrapper: Object exposing ``predict(texts)`` and ``get_label_list()``.
        class_of_interest (int): Requested class label; ``-1`` means "use the label
            predicted on the original text".
        perturbations (list): Perturbations to explain; each one exposes
            ``perturbed_text``.
        original_probabilities: Prediction for the original text
            (``None`` until the phase has been executed).
        original_label: Most likely label for the original text
            (``None`` until the phase has been executed).
        local_explanations (list[LocalExplanation]): Explanations produced so far.
    """

    def __init__(self, preprocessed_text, model_wrapper, class_of_interest, perturbations):
        """
        Args:
            preprocessed_text (str): String created by tokenizing and joining tokens (can contain <OOV> tokens)
            model_wrapper (:obj:ModelWrapperInterface): Instance of a real class implementing the ModelWrapperInterface
            class_of_interest (int): Class label to explain, or ``-1`` to explain the label predicted on the original text
            perturbations (list[:obj:Perturbation]): List of perturbations for which local explanations are created
        """
        self.input_text = preprocessed_text
        self.model_wrapper = model_wrapper
        self.class_of_interest = class_of_interest
        self.perturbations = perturbations
        self.original_probabilities = None
        self.original_label = None
        self.local_explanations = []

    def _compute_scores(self, perturbed_probabilities, class_label):
        """Return ``(nPIR, nPIRP)`` of one perturbation for ``class_label``, or ``(0, 1)`` if scoring fails."""
        try:
            scores = perturbation_scores.PerturbationScores(self.original_probabilities,
                                                            perturbed_probabilities,
                                                            class_label)
            scores.compute_scores()
            return scores.nPIR, scores.nPIRP
        except Exception:
            # Best effort: one failing score must not abort the whole phase.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
            print("An exception has occurred while calculating nPIR and nPIRP.")
            return 0, 1

    def execute_local_explanation_phase(self):
        """Execute the Local Explanation Phase.

        Returns:
            tuple: ``(local_explanations, original_probabilities, original_label)``.
            The produced explanations are also appended to ``self.local_explanations``.
        """
        # Create a list of texts: first element is the original text, the others are the perturbed texts
        texts = [self.input_text]
        texts.extend(perturbation.perturbed_text for perturbation in self.perturbations)

        # Predict output probabilities for original text and perturbed texts
        predictions = self.model_wrapper.predict(texts)

        # The first prediction belongs to the original text; the rest align with `self.perturbations`
        self.original_probabilities = predictions[0]

        # The label list is fetched once and reused for every perturbation
        labels = self.model_wrapper.get_label_list()
        self.original_label = labels[np.argmax(self.original_probabilities)]

        # original_top_class is the most likely predicted label on the Original text
        original_top_class = self.original_label

        # If the class_of_interest is `-1` then assign to it the max predicted class label from the original text
        if self.class_of_interest == -1:
            coi = self.original_label
        else:
            coi = self.class_of_interest

        for local_explanation_id, (perturbed_probabilities, perturbation) in enumerate(
                zip(predictions[1:], self.perturbations)):
            # Scores for the `class_of_interest` -> how influential the current feature is for that class
            nPIR_class_of_interest, nPIRP_class_of_interest = self._compute_scores(perturbed_probabilities, coi)

            # perturbed_top_class is the most likely predicted label on the Perturbed text
            perturbed_top_class = labels[np.argmax(perturbed_probabilities)]

            if original_top_class != coi:
                # Scores for the `original_top_class` -> how influential the current feature is for that class
                nPIR_original_top_class, nPIRP_original_top_class = self._compute_scores(perturbed_probabilities,
                                                                                         original_top_class)
            else:
                # The original top class is the class of interest: the scores are already computed
                nPIR_original_top_class = nPIR_class_of_interest
                nPIRP_original_top_class = nPIRP_class_of_interest

            # Scores for each possible class label -> how influential the current feature is for each label
            nPIRs = []
            nPIRPs = []
            for class_label in labels:
                nPIR, nPIRP = self._compute_scores(perturbed_probabilities, class_label)
                nPIRs.append(nPIR)
                nPIRPs.append(nPIRP)

            # Create an instance of the NumericalExplanation
            numerical_explanation = NumericalExplanation(
                nPIR_original_top_class,   # nPIR obtained by the current feature on the original predicted class
                nPIRP_original_top_class,  # nPIRP obtained by the current feature on the original predicted class
                nPIR_class_of_interest,    # nPIR obtained by the current feature on the class of interest
                nPIRP_class_of_interest,   # nPIRP obtained by the current feature on the class of interest
                nPIRs,                     # List of all nPIR obtained by the current feature on each class label
                nPIRPs)                    # List of all nPIRP obtained by the current feature on each class label

            # Create an instance of the LocalExplanation
            local_explanation = LocalExplanation()
            local_explanation.fit(
                local_explanation_id,                  # Identifier of the local explanation within this input text
                perturbation,                          # Perturbation instance (each perturbation contains a single Feature instance)
                self.original_probabilities.tolist(),  # Probabilities predicted by the model on the original text
                perturbed_probabilities.tolist(),      # Probabilities predicted by the model on the perturbed text
                original_top_class,                    # Most likely class predicted by the model on the original text
                perturbed_top_class,                   # Most likely class predicted by the model on the perturbed text
                coi,                                   # Class of interest
                numerical_explanation)                 # Numerical explanation of the current feature
            self.local_explanations.append(local_explanation)

        return self.local_explanations, self.original_probabilities, self.original_label
class LocalExplanation:
    """Explanation of a single perturbation of one input text.

    Instances start empty and are populated either from live objects (``fit``)
    or from a previously exported dictionary (``fit_from_dict``).
    ``local_explanation_to_dict`` flattens the instance into a dictionary.
    """

    def __init__(self):
        # Every field stays unset until `fit` or `fit_from_dict` is called.
        self.local_explanation_id = None
        self.perturbation = None
        self.original_probabilities = None
        self.perturbed_probabilities = None
        self.original_top_class = None
        self.perturbed_top_class = None
        self.class_of_interest = None
        self.numerical_explanation = None

    def fit(self, local_explanation_id, perturbation, original_probabilities, perturbed_probabilities,
            original_top_class, perturbed_top_class, class_of_interest, numerical_explanation):
        """Store the given values on the instance, one attribute per argument."""
        self.local_explanation_id, self.perturbation = local_explanation_id, perturbation
        self.original_probabilities, self.perturbed_probabilities = (
            original_probabilities, perturbed_probabilities)
        self.original_top_class, self.perturbed_top_class = original_top_class, perturbed_top_class
        self.class_of_interest, self.numerical_explanation = class_of_interest, numerical_explanation

    def fit_from_dict(self, local_explanation_dict):
        """Populate the instance from a dictionary shaped like the output of `local_explanation_to_dict`.

        The feature, the perturbation and the numerical explanation are rebuilt from
        the flat keys. The "k" key written by `local_explanation_to_dict` is not read here.
        """
        source = local_explanation_dict

        # Fields stored under a key with the same name as the attribute.
        for field in ("local_explanation_id",
                      "original_probabilities",
                      "perturbed_probabilities",
                      "original_top_class",
                      "perturbed_top_class",
                      "class_of_interest"):
            setattr(self, field, source[field])

        # Rebuild the feature first: the perturbation wraps it.
        rebuilt_feature = fe.Feature(source["feature_id"],
                                     source["feature_type"],
                                     source["feature_description"],
                                     source["positions_tokens"],
                                     source["combination"])
        self.perturbation = pe.Perturbation(source["perturbation_id"],
                                            source["perturbation_type"],
                                            source["perturbed_text"],
                                            rebuilt_feature)

        self.numerical_explanation = NumericalExplanation(source["nPIR_original_top_class"],
                                                          source["nPIRP_original_top_class"],
                                                          source["nPIR_class_of_interest"],
                                                          source["nPIRP_class_of_interest"],
                                                          source["nPIRs"],
                                                          source["nPIRPs"])

    def local_explanation_to_dict(self):
        """ Converts a single local explanation into dictionary. """
        current_perturbation = self.perturbation
        current_feature = current_perturbation.feature

        # Identifier of the explanation.
        exported = {"local_explanation_id": self.local_explanation_id}

        # Description of the perturbed feature.
        exported.update({
            "feature_id": current_feature.feature_id,
            "feature_type": current_feature.feature_type,
            "feature_description": current_feature.description,
            "positions_tokens": current_feature.positions_tokens,
            "combination": current_feature.combination,
        })

        # Description of the perturbation itself.
        exported.update({
            "perturbation_id": current_perturbation.perturbation_id,
            "perturbation_type": current_perturbation.perturbation_type,
            "perturbed_text": current_perturbation.perturbed_text,
        })

        # Model outputs on the original and on the perturbed text.
        exported.update({
            "original_probabilities": self.original_probabilities,
            "perturbed_probabilities": self.perturbed_probabilities,
            "original_top_class": self.original_top_class,
            "perturbed_top_class": self.perturbed_top_class,
            "class_of_interest": self.class_of_interest,
        })

        # Numerical scores of the perturbation.
        scores = self.numerical_explanation
        exported.update({
            "nPIR_original_top_class": scores.nPIR_original_top_class,
            "nPIRP_original_top_class": scores.nPIRP_original_top_class,
            "nPIR_class_of_interest": scores.nPIR_class_of_interest,
            "nPIRP_class_of_interest": scores.nPIRP_class_of_interest,
            "nPIRs": scores.nPIRs,
            "nPIRPs": scores.nPIRPs,
        })

        exported["k"] = self.perturbation.feature.k
        return exported
class NumericalExplanation:
    """Value holder for the nPIR / nPIRP scores of one perturbation.

    Stores the scores for the original top class, for the class of interest,
    and the two lists given as ``nPIRs`` / ``nPIRPs``.
    """

    def __init__(self, nPIR_original_top_class, nPIRP_original_top_class, nPIR_class_of_interest,
                 nPIRP_class_of_interest, nPIRs, nPIRPs):
        # Scores on the original top class.
        self.nPIR_original_top_class, self.nPIRP_original_top_class = (
            nPIR_original_top_class, nPIRP_original_top_class)
        # Scores on the class of interest.
        self.nPIR_class_of_interest, self.nPIRP_class_of_interest = (
            nPIR_class_of_interest, nPIRP_class_of_interest)
        # Score lists, stored as given.
        self.nPIRs, self.nPIRPs = nPIRs, nPIRPs