added method to calculate the quality score of a category of dimensions
GabrieleT0 committed Sep 21, 2024
1 parent ebadfeb commit 3a28687
Showing 1 changed file with 125 additions and 22 deletions.
147 changes: 125 additions & 22 deletions lodc_quality_evaluation/quality_evaluation_over_time.py
@@ -5,7 +5,7 @@
import ast

class QualityEvaluationOT:
def __init__(self,analysis_results_path,output_file):
def __init__(self,analysis_results_path,output_file='evaluation-over-time'):
'''
Creates a list of CSV files that are to be parsed
@@ -20,6 +20,8 @@ def __init__(self,analysis_results_path,output_file):
file_path = os.path.join(analysis_results_path, filename)
self.analysis_results_files.append(file_path)

self.analysis_results_files = sorted(self.analysis_results_files)
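# Note: sorting keeps the analyses in chronological order, assuming each CSV file
# is named after its analysis date (the basename is later written out as the
# 'Analysis date' column).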

def load_all_csv_as_one(self,metrics_to_select):
'''
Load all CSV files into memory as a single dataframe.
@@ -55,25 +57,48 @@ def extract_only_lodc(self,analysis_results_path):
df_filtered = df[df['KG id'].isin(identifiers)]

df_filtered.to_csv(f"filtered/{filename}.csv",index=False)

def stats_over_time(self, metrics):

def extract_only_lodc_single_file(self,analysis_results_path):
'''
Keep only the KGs that belong to the LOD Cloud from the CSV output produced by KGHeartBeat.
:param analysis_results_path: path to the CSV file from which the non-LOD Cloud KGs are discarded.
'''
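# The LOD Cloud JSON maps every dataset entry to a record with an 'identifier'
# field; that identifier is compared against the 'KG id' column of the
# KGHeartBeat CSV, and rows without a matching identifier are dropped.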
response = requests.get("https://lod-cloud.net/versions/latest/lod-data.json")
kgs = response.json()
print("Number of KG from LODCloud:", len(kgs))
identifiers = [data['identifier'] for key, data in kgs.items()]
if '.csv' in analysis_results_path:
df = pd.read_csv(analysis_results_path)

identifiers_in_csv = set(df['KG id'].unique())
missing_identifiers = set(identifiers) - identifiers_in_csv
print("Missing KGs from KGHeartBeat analysis: ", missing_identifiers)

df['KG id'] = df['KG id'].astype(str).str.strip()
df_filtered = df[df['KG id'].isin(identifiers)]

df_filtered.to_csv(f"filtered/{analysis_results_path}.csv",index=False)

def stats_over_time(self, metrics,only_sparql_up=True):
'''
For every analysis file, compute the min, max, median, mean, Q1 and Q3 of the specified metrics over all KGs in the file, then store the results in a CSV file.
:param metrics: list of strings with the exact CSV column names for which to compute statistics.
:param only_sparql_up: if True, only KGs with an available SPARQL endpoint are included in the statistics; if False, all KGs are considered.
'''
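# One CSV per metric is written under evaluation_results/over_time/, with the
# header row ['Analysis date', 'Min', 'Q1', 'Median', 'Q3', 'Max', 'Mean'] and
# one data row per analysis file.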
data = []
# loop through every file and calculate data for a boxplot
for metric in metrics:
data = []
print(f"Evaluating the {metric} metric\n")
data.append([metric])
data.append(['Analysis date', 'Min', 'Q1', 'Median', 'Q3', 'Max', 'Mean'])
for file_path in self.analysis_results_files:
df = pd.read_csv(file_path)

#Exclude KG with SPARQL endpoint offline or not indicated
df = df[(df["Sparql endpoint"] == "Available")]
if(only_sparql_up == True):
df = df[(df["Sparql endpoint"] == "Available")]

df[metric] = pd.to_numeric(df[metric], errors='coerce')
min_value = df[metric].min()
@@ -86,10 +111,85 @@ def stats_over_time(self, metrics):
evaluation = [os.path.basename(file_path).split('.')[0],min_value, q1_value, median_value, q3_value, max_value, mean_value]
data.append(evaluation)

with open(f'{self.output_file}.csv', mode='a', newline='') as file:
writer = csv.writer(file)
writer.writerows(data)
here = os.path.dirname(os.path.abspath(__file__))
save_path = os.path.join(here,f'./evaluation_results/over_time/{metric}.csv')
with open(save_path, mode='w', newline='') as file:
writer = csv.writer(file)
writer.writerows(data)

def convert_to_category(self,only_sparql_up = True):
'''
Computes a quality score for each category of dimensions: for every measurement, the mean score of each dimension in the category is calculated, and the category score is the average of those means.
:param only_sparql_up: if True, only KGs with an available SPARQL endpoint are considered; if False, all KGs are considered.
'''
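# Worked example: for the "Trust" category, the mean of the 'Verifiability score',
# 'Reputation score' and 'Believability score' columns is computed over the
# selected KGs in one analysis file, and the category score for that date is the
# unweighted average of those three means.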
evaluation_results = []
category = {
"Intrinsic" : {
"Accuracy score" : 0,
"Interlinking score" : 0,
"Consistency score" : 0,
"Conciseness score" : 0,
},
"Datasey dynamicity" : {
"Currency score" : 0,
"Volatility score" : 0,
},
"Trust" : {
"Verifiability score" : 0,
"Reputation score" : 0,
"Believability score" : 0,
},
"Contextual" : {
"Completeness score" : 0,
"Amount of data score" : 0,
},
"Representational" : {
"Representational-Consistency score": 0,
"Representational-Conciseness score" : 0,
"Understandability score" : 0,
"Interpretability score" : 0,
"Versatility score" : 0
},
"Accessibility": {
"Availability score" : 0,
"Licensing score" : 0,
"Security score" : 0,
"Performance score" : 0,
}
}

for key in category:
print(f"Evaluating the {key} category")
data = []
data.append(['Analysis date', 'Mean score'])
for file_path in self.analysis_results_files:
df = pd.read_csv(file_path)
for dimension in category[key]:

if(only_sparql_up == True):
df = df[(df["Sparql endpoint"] == "Available")]

df[dimension] = pd.to_numeric(df[dimension], errors='coerce')
mean_value = df[dimension].mean()

category[key][dimension] = mean_value

values_in_category = []
for dimension in category[key]:
mean_score = category[key][dimension]
values_in_category.append(mean_score)
category_score = sum(values_in_category) / len(values_in_category)

evaluation = [os.path.basename(file_path).split('.')[0], category_score]
data.append(evaluation)

here = os.path.dirname(os.path.abspath(__file__))
save_path = os.path.join(here,f'./evaluation_results/over_time/by_category/{key}.csv')
with open(save_path, mode='w', newline='') as file:
writer = csv.writer(file)
writer.writerows(data)

def evaluate_provenance_info(self):
'''
Evaluate the provenance metrics by checking if an author or a publisher is indicated in the KG.
@@ -111,14 +211,16 @@ def evaluate_provenance_info(self):

evaluation = [os.path.basename(file_path).split('.')[0],min_value, q1_value, median_value, q3_value, max_value, mean_value]
data.append(evaluation)

with open(f'{self.output_file}.csv', mode='a', newline='') as file:

here = os.path.dirname(os.path.abspath(__file__))
save_path = os.path.join(here,f'./evaluation_results/{self.output_file}.csv')
with open(save_path, mode='a', newline='') as file:
writer = csv.writer(file)
writer.writerows(data)

def evaluate_integer_metrics(self,metric,new_column_name):
'''
Evaluates the quality of metrics that have integers as their value.
Evaluates the quality of metrics whose value is a list.
:param metric: the name of the metric to evaluate.
:param new_column_name: the column name in which to insert the number of elements of the measured metric.
@@ -147,7 +249,9 @@ def evaluate_integer_metrics(self,metric,new_column_name):
evaluation = [os.path.basename(file_path).split('.')[0],min_value, q1_value, median_value, q3_value, max_value, mean_value]
data.append(evaluation)

with open(f'{self.output_file}.csv', mode='a', newline='') as file:
here = os.path.dirname(os.path.abspath(__file__))
save_path = os.path.join(here,f'./evaluation_results/{self.output_file}.csv')
with open(save_path, mode='a', newline='') as file:
writer = csv.writer(file)
writer.writerows(data)

@@ -174,16 +278,15 @@ def evaluate_conciseness(self):

evaluation = [os.path.basename(file_path).split('.')[0],min_value, q1_value, median_value, q3_value, max_value, mean_value]
data.append(evaluation)

with open(f'{self.output_file}.csv', mode='a', newline='') as file:

here = os.path.dirname(os.path.abspath(__file__))
save_path = os.path.join(here,f'./evaluation_results/{self.output_file}.csv')
with open(save_path, mode='a', newline='') as file:
writer = csv.writer(file)
writer.writerows(data)





q = QualityEvaluationOT('./quality_data','quality_evaluation_over_time')
#q.stats_over_time(['Deprecated classes/properties used','Invalid usage of inverse-functional properties','','Entities as member of disjoint class'])
#q.evaluate_provenance_info()
q.evaluate_conciseness()
#q.stats_over_time(['Availability score','Licensing score','Interlinking score','Performance score','Accuracy score','Consistency score','Conciseness score',
# 'Verifiability score','Reputation score','Believability score','Currency score','Volatility score','Completeness score','Amount of data score','Representational-Consistency score','Representational-Conciseness score',
# 'Understandability score','Interpretability score','Versatility score','Security score'])
q.convert_to_category()
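# Minimal usage sketch: the per-category scores can also be computed without
# filtering out KGs whose SPARQL endpoint is offline:
# q.convert_to_category(only_sparql_up=False)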
