Skip to content

Commit

Permalink
Added methods to generate statistics and to evaluate availability
Browse files Browse the repository at this point in the history
  • Loading branch information
GabrieleT0 committed Sep 21, 2024
1 parent 3a28687 commit b2389ae
Showing 1 changed file with 136 additions and 16 deletions.
152 changes: 136 additions & 16 deletions lodc_quality_evaluation/punctual_quality_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@
import os
import csv
import ast
import requests
from xml.etree import ElementTree

class PunctualQualityEvaluation:
def __init__(self, analysis_file_path, separator=','):
    '''
    Load the contents of the CSV file containing the analysis data into memory.
    :param analysis_file_path: Path to the file that contains the quality data to be evaluated
    :param separator: separator used in the analysis file (by default is ',')
    '''
    # The whole analysis snapshot is kept in memory as a pandas DataFrame.
    self.analysis_data = pd.read_csv(analysis_file_path, sep=separator)

def group_by_value(self,metric):
'''
Expand Down Expand Up @@ -42,7 +45,6 @@ def count_elements_by_type(self,metric):
except Exception as error:
#print(error)
continue

df = pd.DataFrame(values.items())
self.write_data_on_csv('serial',df,False)

Expand All @@ -51,16 +53,16 @@ def accessibility_stats(self):
Evaluate accessibility metrics.
'''

all_up = self.analysis_data[(self.analysis_data['Sparql endpoint'] == 'Available') & (self.analysis_data['Availability of RDF dump (metadata)'] == 1) & (self.analysis_data['Availability VoID file'] == 'VoID file available')].shape[0]
all_down = self.analysis_data[(self.analysis_data['Sparql endpoint'] != 'Available') & (self.analysis_data['Availability of RDF dump (metadata)'] != 1) & (self.analysis_data['Availability VoID file'] != 'VoID file available')].shape[0]
only_sparql = self.analysis_data[(self.analysis_data['Sparql endpoint'] == 'Available') & (self.analysis_data['Availability of RDF dump (metadata)'] != 1) & (self.analysis_data['Availability VoID file'] != 'VoID file available')].shape[0]
only_dump = self.analysis_data[(self.analysis_data['Sparql endpoint'] != 'Available') & (self.analysis_data['Availability of RDF dump (metadata)'] == 1) & (self.analysis_data['Availability VoID file'] != 'VoID file available')].shape[0]
only_void = self.analysis_data[(self.analysis_data['Sparql endpoint'] != 'Available') & (self.analysis_data['Availability of RDF dump (metadata)'] != 1) & (self.analysis_data['Availability VoID file'] == 'VoID file available')].shape[0]
sparql_dump = self.analysis_data[(self.analysis_data['Sparql endpoint'] == 'Available') & (self.analysis_data['Availability of RDF dump (metadata)'] == 1) & (self.analysis_data['Availability VoID file'] != 'VoID file available')].shape[0]
sparql_void = self.analysis_data[(self.analysis_data['Sparql endpoint'] == 'Available') & (self.analysis_data['Availability of RDF dump (metadata)'] != 1) & (self.analysis_data['Availability VoID file'] == 'VoID file available')].shape[0]
dump_void = self.analysis_data[(self.analysis_data['Sparql endpoint'] != 'Available') & (self.analysis_data['Availability of RDF dump (metadata)'] == 1) & (self.analysis_data['Availability VoID file'] == 'VoID file available')].shape[0]
sparql_dump_down = self.analysis_data[(self.analysis_data['Sparql endpoint'] != 'Available') & (self.analysis_data['Availability of RDF dump (metadata)'] != 1)].shape[0]
sparql_or_dump_UP = self.analysis_data[(self.analysis_data['Sparql endpoint'] == 'Available') | (self.analysis_data['Availability of RDF dump (metadata)'] == 1)].shape[0]
all_up = self.analysis_data[(self.analysis_data['SPARQL availability'] == True) & (self.analysis_data['RDF dump availability'] == True) & (self.analysis_data['VoID availability'] == 'VoID file available')].shape[0]
all_down = self.analysis_data[(self.analysis_data['SPARQL availability'] != True) & (self.analysis_data['RDF dump availability'] != True) & (self.analysis_data['VoID availability'] != 'VoID file available')].shape[0]
only_sparql = self.analysis_data[(self.analysis_data['SPARQL availability'] == True) & (self.analysis_data['RDF dump availability'] != True) & (self.analysis_data['VoID availability'] != 'VoID file available')].shape[0]
only_dump = self.analysis_data[(self.analysis_data['SPARQL availability'] != True) & (self.analysis_data['RDF dump availability'] == True) & (self.analysis_data['VoID availability'] != 'VoID file available')].shape[0]
only_void = self.analysis_data[(self.analysis_data['SPARQL availability'] != True) & (self.analysis_data['RDF dump availability'] != True) & (self.analysis_data['VoID availability'] == 'VoID file available')].shape[0]
sparql_dump = self.analysis_data[(self.analysis_data['SPARQL availability'] == True) & (self.analysis_data['RDF dump availability'] == True) & (self.analysis_data['VoID availability'] != 'VoID file available')].shape[0]
sparql_void = self.analysis_data[(self.analysis_data['SPARQL availability'] == True) & (self.analysis_data['RDF dump availability'] != True) & (self.analysis_data['VoID availability'] == 'VoID file available')].shape[0]
dump_void = self.analysis_data[(self.analysis_data['SPARQL availability'] != True) & (self.analysis_data['RDF dump availability'] == True) & (self.analysis_data['VoID availability'] == 'VoID file available')].shape[0]
sparql_dump_down = self.analysis_data[(self.analysis_data['SPARQL availability'] != True) & (self.analysis_data['RDF dump availability'] != True)].shape[0]
sparql_or_dump_UP = self.analysis_data[(self.analysis_data['SPARQL availability'] == True) | (self.analysis_data['RDF dump availability'] == True)].shape[0]

result = {
"SPARQL, Dump and VoID online" : all_up,
def write_data_on_csv(self, metric, pandas_df, index=True):
    '''
    Write a dataframe to a CSV file inside the evaluation_results directory.
    :param metric: The name of the metric evaluated, used as filename.
    :param pandas_df: pandas df to write in the csv file.
    :param index: whether to write the dataframe index as a column (pandas default).
    '''
    here = os.path.dirname(os.path.abspath(__file__))
    save_dir = os.path.join(here, 'evaluation_results')
    # Create the output directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f'{metric}_evaluation-punctual.csv')
    pandas_df.to_csv(save_path, index=index)

# Module-level driver: runs on import/execution.
# Evaluates the accessibility statistics of the 2024-09-01 quality snapshot.
c = PunctualQualityEvaluation('./quality_data/2024-09-01.csv')
v = c.accessibility_stats()
def compare_column(self, column_to_compare, sparql_av=False):
    '''
    Extract the values of the given columns so they can be compared side by side,
    and write the result to a CSV file.
    :param column_to_compare: array of strings, each string corresponds to the name of the column you want to extract from the dataframe.
    :param sparql_av: if set to true, only KGs with active sparql endpoint will be considered in the comparison
    '''
    source = self.analysis_data
    if sparql_av == True:
        # Restrict the comparison to KGs whose endpoint answered during the analysis.
        source = source[source["Sparql endpoint"] == "Available"]
    selected = source[column_to_compare]

    self.write_data_on_csv(f'Comparison-column_{column_to_compare}', selected, index=False)

def get_kgs_available_with_license(self):
    '''
    Extract KGs that have at least one SPARQL endpoint, RDF dump or VoID file
    available along with a license, and write them to a CSV file.
    '''
    d = self.analysis_data

    # At least one of the three access mechanisms is reachable.
    reachable = (
        (d['SPARQL availability'] == True)
        | (d['RDF dump availability'] == True)
        | (d['VoID availability'] == 'VoID file available')
    )
    # A license is declared either in the metadata or via query.
    licensed = (
        (d['License (metadata)'] != '-')
        | (d['License machine redeable (query)'] != '-')
    )

    self.write_data_on_csv('availability_and_license', d[reachable & licensed])

def check_machine_redeable_resolution(self, links):
    '''
    Check if the link return a machine-redeable common accepted format.

    Every link is requested once per Accept header (JSON, XML, RDF/XML) and
    the outcome of each attempt is printed on stdout.
    :param links: list of links to run the test on.
    '''
    # Accept headers tried, in order, for every link (content negotiation).
    accept_headers = [
        {'Accept': 'application/json'},     # JSON
        {'Accept': 'application/xml'},      # XML
        {'Accept': 'application/rdf+xml'},  # RDF/XML
    ]

    for link in links:
        for headers in accept_headers:
            try:
                resp = requests.get(link, headers=headers)
                if resp.status_code != 200:
                    continue  # this header yielded no usable answer; try the next one

                mime = resp.headers.get('Content-Type', '').lower()

                if 'application/json' in mime:
                    try:
                        resp.json()  # parsed only to validate; content is discarded
                        print(f"JSON available for {link}")
                    except ValueError:
                        continue
                elif 'application/xml' in mime or 'text/xml' in mime:
                    try:
                        ElementTree.fromstring(resp.content)
                        print(f"XML available for {link}")
                    except ElementTree.ParseError:
                        continue
                elif 'application/rdf+xml' in mime:
                    try:
                        ElementTree.fromstring(resp.content)
                        print(f"RDF available for {link}")
                    except ElementTree.ParseError:
                        continue
                elif 'text/html' in mime:
                    print(f"ONLY HTML for {link}")
                else:
                    # NOTE(review): reached with a 200 response whose content type
                    # matched none of the above; message wording preserved as-is.
                    print(f"Errore during request: {resp.status_code}")
            except requests.RequestException as e:
                print(f"Errore durante la richiesta con header {headers}: {e}")

def generate_stats(self, metrics, output_filename, only_sparql_up=True):
    '''
    Compute summary statistics (Min, Q1, Median, Q3, Max, Mean) for each metric
    and write them to a CSV file under evaluation_results/punctual.
    :param metrics: list of column names in the analysis data to summarize.
    :param output_filename: name (without extension) of the output CSV file.
    :param only_sparql_up: if True, exclude KGs whose SPARQL endpoint is offline or not indicated.
    '''
    # Hoist the invariant filter out of the per-metric loop.
    if only_sparql_up:
        df = self.analysis_data[self.analysis_data["Sparql endpoint"] == "Available"]
    else:
        df = self.analysis_data

    # Short display names for a few long metric labels.
    display_names = {
        'Representational-Consistency score': 'Interoperability',
        'Representational-Conciseness score': 'Rep.-Conc.',
        'Understandability score': 'Underst.',
    }

    data = [['Dimension', 'Min', 'Q1', 'Median', 'Q3', 'Max', 'Mean']]
    for metric in metrics:
        # Coerce to numeric on a standalone Series instead of writing back into
        # the dataframe: the original in-place assignment mutated
        # self.analysis_data when only_sparql_up=False and raised
        # SettingWithCopyWarning on the filtered view.
        values = pd.to_numeric(df[metric], errors='coerce')
        label = display_names.get(metric, metric)
        data.append([
            label.split(' ')[0],
            values.min(),
            values.quantile(0.25),
            values.median(),
            values.quantile(0.75),
            values.max(),
            values.mean(),
        ])

    here = os.path.dirname(os.path.abspath(__file__))
    save_dir = os.path.join(here, 'evaluation_results', 'punctual')
    # Create the output directory on first use instead of failing.
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f'{output_filename}.csv')
    with open(save_path, mode='w', newline='') as file:
        csv.writer(file).writerows(data)

# Module-level driver: runs on import/execution.
# Generates the per-metric statistics for the 2024-09-08 quality snapshot.
c = PunctualQualityEvaluation('./quality_data/2024-09-08.csv')
c.generate_stats(
    [
        'Availability score',
        'Licensing score',
        'Interlinking score',
        'Performance score',
        'Accuracy score',
        'Consistency score',
        'Conciseness score',
        'Verifiability score',
        'Reputation score',
        'Believability score',
        'Currency score',
        'Volatility score',
        'Completeness score',
        'Amount of data score',
        'Representational-Consistency score',
        'Representational-Conciseness score',
        'Understandability score',
        'Interpretability score',
        'Versatility score',
        'Security score',
    ],
    'metrics_stats_punctual',
)

0 comments on commit b2389ae

Please sign in to comment.