-
Notifications
You must be signed in to change notification settings - Fork 0
/
aggregator_weighting.py
108 lines (82 loc) · 4.47 KB
/
aggregator_weighting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""
Created on Sun May 2 12:13:52 2021
@author: Harry Jackson
Functions and code to score prediction performance and calculate node weightings using RMSE
"""
#Imports
import pandas as pd
import numpy as np
"""
Reformats the inputted predictions and actuals from wide format to long
Args:
predictions (dataframe): The nodes' predictions for the labels by content hash, in the format [content_hash, node_identifier, adult, suggestive,
violence, visually_disturbing, hate_symbols].
actuals (dataframe): The actual values for the labels, in the format [content_hash, adult, suggestive,
violence, visually_disturbing, hate_symbols].
Returns (dataframe, dataframe):
Two dataframes that have been pivoted from wide to long
"""
def wide_to_long(predictions, actuals, labels=None):
    """Melt the wide prediction and actual tables into long format.

    Each label column becomes its own row, so the two outputs can be joined
    on (content_hash, label).

    Args:
        predictions (dataframe): Node predictions with the columns
            [content_hash, node_identifier, <one column per label>].
        actuals (dataframe): Actual values with the columns
            [content_hash, <one column per label>].
        labels (sequence of str, optional): The label columns to unpivot.
            Defaults to adult, suggestive, violence, visually_disturbing
            and hate_symbols.

    Returns (dataframe, dataframe):
        The long predictions [content_hash, node_identifier, label,
        prediction] and the long actuals [content_hash, label, actual].
    """
    if labels is None:
        labels = ['adult', 'suggestive', 'violence',
                  'visually_disturbing', 'hate_symbols']
    else:
        # Accept any iterable (tuple, Index, ...) and use it for both melts
        labels = list(labels)

    long_predictions = pd.melt(
        predictions,
        id_vars=['content_hash', 'node_identifier'],
        value_vars=labels,
        var_name="label",
        value_name="prediction",
    )
    long_actuals = pd.melt(
        actuals,
        id_vars=['content_hash'],
        value_vars=labels,
        var_name="label",
        value_name="actual",
    )
    return long_predictions, long_actuals
"""
Calculates the RMSE of a node. The error is the difference between the prediction and the actual values by label.
To calculate this we square the individual errors, take the mean by node/label, then square root the result.
This will give us our error %, by taking this from 100 we can thus calculate our accuracy.
This value does not need standardising as the max of a label is 100 and the min is 0.
Thus the max overall accuracy of a node is 100 and the minimum is 0.
Args:
errors (dataframe): A dataframe with cols [node_identifier, label, error]
Returns (dataframe):
A dataframe with the RMSE accuracy calculated by node/label
"""
def calculate_RMSE(errors):
    """Compute the root mean squared error and accuracy per node and label.

    The errors are squared, averaged within each (node_identifier, label)
    group and square-rooted to give the RMSE. Accuracy is 100 - RMSE.

    Args:
        errors (dataframe): Must contain the columns node_identifier, label
            and error. Other columns may be present; non-numeric ones (for
            example content_hash) are left out of the result.

    Returns (dataframe):
        One row per (node_identifier, label) holding the group means of the
        numeric input columns plus squared_error, RMSE and accuracy.
    """
    # assign() returns a new frame, so the caller's dataframe does not
    # silently gain a squared_error column.
    squared = errors.assign(squared_error=errors["error"] ** 2)
    # Calculate the mean of the squared errors by node/label. numeric_only
    # is explicit because pandas >= 2.0 otherwise raises TypeError when a
    # string column such as content_hash is still in the frame.
    results = squared.groupby(["node_identifier", "label"]).mean(numeric_only=True)
    results["RMSE"] = results["squared_error"] ** 0.5
    results["accuracy"] = 100 - results["RMSE"]
    return results.reset_index()
"""
Takes the accuracy scores by node and label and will calculate the weighting each node should receive by normalising these accuracies.
Args:
results (dataframe): The dataframe to normalise the accuracies across to provide a weighting, in the format - [node_identifier, label, accuracy]
Returns (dataframe):
A dataframe of weights by node containing the weighting a node should be given in the aggregation.
"""
def agg_normalise(results):
    """Turn per-node accuracies into aggregation weightings per label.

    Within each label, a node's weighting is its accuracy divided by the sum
    of all nodes' accuracies for that label. Missing accuracies count as 0.

    Args:
        results (dataframe): Must contain the columns node_identifier, label
            and accuracy.

    Returns (dataframe):
        A new dataframe [node_identifier, label, accuracy, weighting]; the
        input is not modified.
    """
    weights = results[["node_identifier", "label", "accuracy"]].copy()
    # Replace na with 0 so a node without a score gets no weight
    weights["accuracy"] = weights["accuracy"].fillna(0)

    per_label = weights.groupby("label")["accuracy"]
    total = per_label.transform("sum")
    node_count = per_label.transform("count")

    # A label whose accuracies sum to 0 would give NaN/inf weightings.
    # Mask the zero totals and fall back to an equal share per node so the
    # weightings for that label remain finite and sum to 1.
    nonzero_total = total.where(total != 0)
    weights["weighting"] = (weights["accuracy"] / nonzero_total).fillna(1.0 / node_count)
    return weights
#Main script
# Guarded so that importing this module (for example to reuse or test the
# functions above) does not read or write any CSV files. The statements stay
# at module scope rather than inside a function, so running the file as a
# script still leaves prediction_data, combined, results and weights defined
# as globals.
if __name__ == "__main__":
    #Load in training and testing data
    # dtype keys are column positions: identifier columns are read as strings
    # and the five label columns as floats.
    prediction_dtypes = {0: 'str', 1: 'str'}
    prediction_dtypes.update({position: np.float64 for position in range(2, 7)})
    actual_dtypes = {0: 'str'}
    actual_dtypes.update({position: np.float64 for position in range(1, 6)})

    prediction_data = pd.read_csv('predictions_data.csv', dtype=prediction_dtypes)
    actual_data = pd.read_csv('actuals_data.csv', dtype=actual_dtypes)

    #Pivot data to long format
    long_prediction, long_actual = wide_to_long(prediction_data, actual_data)

    #Join actuals to predictions based upon content hash and label
    combined = long_prediction.merge(right=long_actual, how='inner', on=['content_hash', 'label'])

    #Calculate error between prediction and actual
    combined["error"] = combined["prediction"] - combined["actual"]

    #Calculate root mean squared error from error and actuals
    results = calculate_RMSE(combined)

    #Normalise RMSE across nodes to return a normalised weighting
    weights = agg_normalise(results)
    print("WEIGHTS")
    print(weights)

    #Save to CSV
    weights.to_csv("node_weightings.csv")