added method to calculate the quality score of a category of dimensions
GabrieleT0 committed Sep 21, 2024
1 parent ebadfeb commit 3a28687
Showing 1 changed file with 125 additions and 22 deletions.
147 changes: 125 additions & 22 deletions lodc_quality_evaluation/quality_evaluation_over_time.py
@@ -5,7 +5,7 @@
import ast

class QualityEvaluationOT:
def __init__(self,analysis_results_path,output_file):
def __init__(self,analysis_results_path,output_file='evaluation-over-time'):
'''
Creates a list of CSV files that are to be parsed
@@ -20,6 +20,8 @@ def __init__(self,analysis_results_path,output_file):
file_path = os.path.join(analysis_results_path, filename)
self.analysis_results_files.append(file_path)

self.analysis_results_files = sorted(self.analysis_results_files)
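# Note: sorting keeps the analyses in chronological order, assuming each CSV file
# is named after its analysis date (the basename is later written out as the
# 'Analysis date' column).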

def load_all_csv_as_one(self,metrics_to_select):
'''
Load all CSV files into memory as a single dataframe.
@@ -55,25 +57,48 @@ def extract_only_lodc(self,analysis_results_path):
df_filtered = df[df['KG id'].isin(identifiers)]

df_filtered.to_csv(f"filtered/{filename}.csv",index=False)

def stats_over_time(self, metrics):

def extract_only_lodc_single_file(self,analysis_results_path):
'''
Keep only the KGs that belong to the LOD Cloud from the CSV output produced by KGHeartBeat.
:param analysis_results_path: path to the CSV file from which the non-LOD Cloud KGs are discarded.
'''
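# The LOD Cloud JSON maps every dataset entry to a record with an 'identifier'
# field; that identifier is compared against the 'KG id' column of the
# KGHeartBeat CSV, and rows without a matching identifier are dropped.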
response = requests.get("https://lod-cloud.net/versions/latest/lod-data.json")
kgs = response.json()
print("Number of KG from LODCloud:", len(kgs))
identifiers = [data['identifier'] for key, data in kgs.items()]
if '.csv' in analysis_results_path:
df = pd.read_csv(analysis_results_path)

identifiers_in_csv = set(df['KG id'].unique())
missing_identifiers = set(identifiers) - identifiers_in_csv
print("Missing KGs from KGHeartBeat analysis: ", missing_identifiers)

df['KG id'] = df['KG id'].astype(str).str.strip()
df_filtered = df[df['KG id'].isin(identifiers)]

df_filtered.to_csv(f"filtered/{analysis_results_path}.csv",index=False)

def stats_over_time(self, metrics,only_sparql_up=True):
'''
For every analysis file, compute the min, max, median, mean, Q1 and Q3 of the specified metrics over all KGs in the file, then store the results in a CSV file.
:param metrics: list of strings with the exact CSV column names for which to compute statistics.
:param only_sparql_up: if True, only KGs with an available SPARQL endpoint are included in the statistics; if False, all KGs are considered.
'''
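# One CSV per metric is written under evaluation_results/over_time/, with the
# header row ['Analysis date', 'Min', 'Q1', 'Median', 'Q3', 'Max', 'Mean'] and
# one data row per analysis file.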
data = []
# loop through every file and calculate data for a boxplot
for metric in metrics:
data = []
print(f"Evaluating the {metric} metric\n")
data.append([metric])
data.append(['Analysis date', 'Min', 'Q1', 'Median', 'Q3', 'Max', 'Mean'])
for file_path in self.analysis_results_files:
df = pd.read_csv(file_path)

#Exclude KG with SPARQL endpoint offline or not indicated
df = df[(df["Sparql endpoint"] == "Available")]
if(only_sparql_up == True):
df = df[(df["Sparql endpoint"] == "Available")]

df[metric] = pd.to_numeric(df[metric], errors='coerce')
min_value = df[metric].min()
@@ -86,10 +111,85 @@ def stats_over_time(self, metrics):
evaluation = [os.path.basename(file_path).split('.')[0],min_value, q1_value, median_value, q3_value, max_value, mean_value]
data.append(evaluation)

with open(f'{self.output_file}.csv', mode='a', newline='') as file:
writer = csv.writer(file)
writer.writerows(data)
here = os.path.dirname(os.path.abspath(__file__))
save_path = os.path.join(here,f'./evaluation_results/over_time/{metric}.csv')
with open(save_path, mode='w', newline='') as file:
writer = csv.writer(file)
writer.writerows(data)

def convert_to_category(self,only_sparql_up = True):
'''
Computes a quality score for each category of dimensions: for every measurement, the mean score of each dimension in the category is calculated, and the category score is the average of those means.
:param only_sparql_up: if True, only KGs with an available SPARQL endpoint are considered; if False, all KGs are considered.
'''
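# Worked example: for the "Trust" category, the mean of the 'Verifiability score',
# 'Reputation score' and 'Believability score' columns is computed over the
# selected KGs in one analysis file, and the category score for that date is the
# unweighted average of those three means.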
evaluation_results = []
category = {
"Intrinsic" : {
"Accuracy score" : 0,
"Interlinking score" : 0,
"Consistency score" : 0,
"Conciseness score" : 0,
},
"Datasey dynamicity" : {
"Currency score" : 0,
"Volatility score" : 0,
},
"Trust" : {
"Verifiability score" : 0,
"Reputation score" : 0,
"Believability score" : 0,
},
"Contextual" : {
"Completeness score" : 0,
"Amount of data score" : 0,
},
"Representational" : {
"Representational-Consistency score": 0,
"Representational-Conciseness score" : 0,
"Understandability score" : 0,
"Interpretability score" : 0,
"Versatility score" : 0
},
"Accessibility": {
"Availability score" : 0,
"Licensing score" : 0,
"Security score" : 0,
"Performance score" : 0,
}
}

for key in category:
print(f"Evaluating the {key} category")
data = []
data.append(['Analysis date', 'Mean score'])
for file_path in self.analysis_results_files:
df = pd.read_csv(file_path)
for dimension in category[key]:

if(only_sparql_up == True):
df = df[(df["Sparql endpoint"] == "Available")]

df[dimension] = pd.to_numeric(df[dimension], errors='coerce')
mean_value = df[dimension].mean()

category[key][dimension] = mean_value

values_in_category = []
for dimension in category[key]:
mean_score = category[key][dimension]
values_in_category.append(mean_score)
category_score = sum(values_in_category) / len(values_in_category)

evaluation = [os.path.basename(file_path).split('.')[0], category_score]
data.append(evaluation)

here = os.path.dirname(os.path.abspath(__file__))
save_path = os.path.join(here,f'./evaluation_results/over_time/by_category/{key}.csv')
with open(save_path, mode='w', newline='') as file:
writer = csv.writer(file)
writer.writerows(data)

def evaluate_provenance_info(self):
'''
Evaluate the provenance metrics by checking if an author or a publisher is indicated in the KG.
@@ -111,14 +211,16 @@ def evaluate_provenance_info(self):

evaluation = [os.path.basename(file_path).split('.')[0],min_value, q1_value, median_value, q3_value, max_value, mean_value]
data.append(evaluation)

with open(f'{self.output_file}.csv', mode='a', newline='') as file:

here = os.path.dirname(os.path.abspath(__file__))
save_path = os.path.join(here,f'./evaluation_results/{self.output_file}.csv')
with open(save_path, mode='a', newline='') as file:
writer = csv.writer(file)
writer.writerows(data)

def evaluate_integer_metrics(self,metric,new_column_name):
'''
Evaluates the quality of metrics that have integers as their value.
Evaluates the quality of metrics whose value is a list.
:param metric: the name of the metric to evaluate.
:param new_column_name: the column name in which to insert the number of elements of the measured metric.
@@ -147,7 +249,9 @@ def evaluate_integer_metrics(self,metric,new_column_name):
evaluation = [os.path.basename(file_path).split('.')[0],min_value, q1_value, median_value, q3_value, max_value, mean_value]
data.append(evaluation)

with open(f'{self.output_file}.csv', mode='a', newline='') as file:
here = os.path.dirname(os.path.abspath(__file__))
save_path = os.path.join(here,f'./evaluation_results/{self.output_file}.csv')
with open(save_path, mode='a', newline='') as file:
writer = csv.writer(file)
writer.writerows(data)

@@ -174,16 +278,15 @@ def evaluate_conciseness(self):

evaluation = [os.path.basename(file_path).split('.')[0],min_value, q1_value, median_value, q3_value, max_value, mean_value]
data.append(evaluation)

with open(f'{self.output_file}.csv', mode='a', newline='') as file:

here = os.path.dirname(os.path.abspath(__file__))
save_path = os.path.join(here,f'./evaluation_results/{self.output_file}.csv')
with open(save_path, mode='a', newline='') as file:
writer = csv.writer(file)
writer.writerows(data)





q = QualityEvaluationOT('./quality_data','quality_evaluation_over_time')
#q.stats_over_time(['Deprecated classes/properties used','Invalid usage of inverse-functional properties','','Entities as member of disjoint class'])
#q.evaluate_provenance_info()
q.evaluate_conciseness()
#q.stats_over_time(['Availability score','Licensing score','Interlinking score','Performance score','Accuracy score','Consistency score','Conciseness score',
# 'Verifiability score','Reputation score','Believability score','Currency score','Volatility score','Completeness score','Amount of data score','Representational-Consistency score','Representational-Conciseness score',
# 'Understandability score','Interpretability score','Versatility score','Security score'])
q.convert_to_category()
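# Minimal usage sketch: the per-category scores can also be computed without
# filtering out KGs whose SPARQL endpoint is offline:
# q.convert_to_category(only_sparql_up=False)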
