# paper_tables.py
import pandas as pd
import numpy as np
import math
import re
# Fixed display order of the systems under test (SUTs) in every table.
custom_order = {"clevercs": 0, "csvcommons": 1, "rhypoparsr": 2, "opencsv": 3, "pandas": 4, "pycsv": 5,
                "rcsv": 6, "univocity": 7, "mariadb": 8, "mysql": 9, "postgres": 10, "sqlite": 11,
                "libreoffice": 12, "spreaddesktop": 13, "spreadweb": 14, "dataviz": 15}
SUTS = list(custom_order.keys())
def round_down(n, decimals=2):
    """Round n down (towards the floor) to the given number of decimals."""
    multiplier = 10 ** decimals
    return math.floor(n * multiplier) / multiplier
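# For example, round_down(0.999) == 0.99, whereas round(0.999, 2) == 1.0:
# flooring guarantees that a score is never reported higher than its true value.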
def get_loading_times(sut, dataset, dataframe=None):
    """Return the mean +- standard deviation of the loading times, formatted in ms."""
    # Use the given dataframe if provided, otherwise read the timing csv of the SUT.
    sut_time_df = dataframe if dataframe is not None else pd.read_csv(f"results/{sut}/{dataset}/{sut}_time.csv")
    # Flatten all timing columns (every column except the leading filename column).
    all_times = sut_time_df[sut_time_df.columns[1:]].values.flatten()
    benchmark_time = all_times.mean()
    # Standard deviation, i.e. the square root of the variance of the loading times.
    benchmark_time_std = np.std(all_times)
    # Times are stored in seconds; report them in milliseconds.
    return f'{round(benchmark_time * 1000, 2)} +- {round(benchmark_time_std * 1000, 2)}'
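# Example (hypothetical numbers): get_loading_times("pandas", "polluted_files")
# reads results/pandas/polluted_files/pandas_time.csv and returns a string like
# "12.34 +- 0.56".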
def generate_table_5():
    df = pd.read_csv('results/global_results_polluted_files.csv')
    headers = ['S', 'HF1', 'RF1', 'CF1', 'Loading time (ms)']
    # One row per SUT (in the custom order), one column per metric.
    table = pd.DataFrame(columns=headers, index=SUTS)
    # The row of the clean, RFC4180-compliant source file.
    source_row = df.loc[df['file'] == 'source.csv']
    for sut in SUTS:
        # Keep only the SUTs that load the source file imperfectly: drop every
        # SUT whose success and three F1-scores all equal 1.
        if source_row[f'{sut}_success'].values[0] == 1 and source_row[f'{sut}_header_f1'].values[0] == 1 \
                and source_row[f'{sut}_record_f1'].values[0] == 1 and source_row[f'{sut}_cell_f1'].values[0] == 1:
            table = table.drop(sut)
            continue
        table.loc[sut, 'S'] = round_down(source_row[f'{sut}_success'].values[0])
        table.loc[sut, 'HF1'] = round_down(source_row[f'{sut}_header_f1'].values[0])
        table.loc[sut, 'RF1'] = round_down(source_row[f'{sut}_record_f1'].values[0])
        table.loc[sut, 'CF1'] = round_down(source_row[f'{sut}_cell_f1'].values[0])
        # Restrict the timing dataframe to the source file before averaging.
        source_row_time_df = pd.read_csv(f"results/{sut}/polluted_files/{sut}_time.csv")
        source_row_time_df = source_row_time_df.loc[source_row_time_df['filename'] == 'source.csv']
        table.loc[sut, 'Loading time (ms)'] = get_loading_times(sut=sut, dataset='polluted_files', dataframe=source_row_time_df)
    return table
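# Table 5 lists only the imperfect loaders; a row looks like, e.g. (values hypothetical):
#                 S   HF1   RF1   CF1  Loading time (ms)
# rhypoparsr    1.0  0.99  0.98  0.98      150.21 +- 3.5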
def generate_table_6():
    df = pd.read_csv('results/global_results_polluted_files.csv')
    df.set_index('file', inplace=True)
    headers = ['File and table pollution',
               'Inconsistent number of delimiters',
               'Structural character change']
    subheaders = ['S', 'HF1', 'RF1', 'CF1']
    # One regex per pollution group, matching the polluted file names of that group.
    rexes = ["file_double.*|file_header.*|file_no.*|file_one.*|file_multi.*|file_preamble.*",
             "row_less.*|row_more.*",
             "file_field.*|row_field.*|file_quote.*|file_record_delimiter.*|row_extra_quote.*|file_escape.*"]
    metrics = ['success', 'header_f1', 'record_f1', 'cell_f1']
    # Build the column index: (pollution group, metric) pairs, followed by the
    # two Pollock scores and the loading time.
    table_header = pd.MultiIndex.from_product([headers, subheaders])
    table_header = table_header.insert(len(table_header), ('Pollock score', 'Simple'))
    table_header = table_header.insert(len(table_header), ('Pollock score', 'Weighted'))
    table_header = table_header.insert(len(table_header), ('Loading time (ms)', ''))
    table = pd.DataFrame(columns=table_header, index=SUTS)
    for sut in SUTS:
        for idx, header in enumerate(headers):
            rx = rexes[idx]
            # Average each metric over the files of this pollution group.
            files = [f for f in df.index if re.search(rx, f)]
            means = df.loc[files].mean()
            for jdx, metric in enumerate(metrics):
                value = means[sut + "_" + metric]
                table.loc[sut, (header, subheaders[jdx])] = round_down(value)
        # Simple Pollock score: sum of the per-file means of this SUT's four metrics.
        pollock_simple = sum(df.mean().loc[[c for c in df.columns if sut in c]])
        table.loc[sut, ("Pollock score", "Simple")] = round_down(pollock_simple)
        # Weighted Pollock score: per-file metric sums, weighted by the file's
        # normalized pollution weight.
        partial_mean = df[[c for c in df.columns if sut in c]].sum(axis=1) * df["normalized_weight"]
        pollock_weighted = sum(partial_mean)
        table.loc[sut, ("Pollock score", "Weighted")] = round_down(pollock_weighted)
        table.loc[sut, ("Loading time (ms)", "")] = get_loading_times(sut=sut, dataset='polluted_files')
    return table
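# In formula form, the two scores computed above are (per SUT, over all files f):
#   simple   = mean_f(S_f) + mean_f(HF1_f) + mean_f(RF1_f) + mean_f(CF1_f)
#   weighted = sum_f w_f * (S_f + HF1_f + RF1_f + CF1_f)
# where w_f is the file's normalized_weight (presumably normalized so that the
# weights sum to 1, which makes the two scores comparable in scale).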
def generate_table_7():
    df = pd.read_csv('results/global_results_survey_sample.csv')
    df.set_index('file', inplace=True)
    headers = ['S', 'HF1', 'RF1', 'CF1', 'Po.', 'Loading time (ms)']
    metrics = ['success', 'header_f1', 'record_f1', 'cell_f1', 'pollock_simple', 'loading_time']
    table = pd.DataFrame(columns=headers, index=SUTS)
    for sut in SUTS:
        for idx, header in enumerate(headers):
            metric = metrics[idx]
            if metric == 'loading_time':
                value = get_loading_times(sut=sut, dataset='survey_sample')
            elif metric == 'pollock_simple':
                # Same simple Pollock score as in Table 6, on the survey sample.
                pollock_simple = sum(df.mean().loc[[c for c in df.columns if sut in c]])
                value = round_down(pollock_simple)
            else:
                value = round_down(df.mean()[sut + "_" + metric])
            table.loc[sut, header] = value
    return table
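# Running this module prints Tables 5-7 to stdout. It assumes the results/
# directory produced by the benchmark runs (the global result csvs plus one
# <sut>_time.csv per system and dataset) is present in the working directory.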
if __name__ == "__main__":
    table_5 = generate_table_5()
    print('---------- Table 5: Systems with imperfect loading of the source file (RFC4180-compliant): success and F1-scores. ----------')
    print()
    print(table_5)
    print()
    table_6 = generate_table_6()
    print('---------- Table 6: Pollock results (rounded down) of the 16 systems under test, grouped by pollution type. ----------')
    print('\n\tFile and table pollution (12 files)')
    print(table_6['File and table pollution'])
    print('\n\tInconsistent number of delimiters (1428 files)')
    print(table_6['Inconsistent number of delimiters'])
    print('\n\tStructural character change (850 files)')
    print(table_6['Structural character change'])
    print('\n\tPollock score (2289 + 1 files) and average file-wise time (ms)')
    print(table_6[['Pollock score', 'Loading time (ms)']])
    print()
    # Uncomment for the full table:
    # print(table_6)
    table_7 = generate_table_7()
    print('---------- Table 7: Results on a sample of 100 files from our survey. ----------')
    print()
    print(table_7)
    print()