Fetch the EFO names, to check that they match the given EFO IDs and i… #296

Open — wants to merge 2 commits into master
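At a glance: the curation template now carries both the EFO trait IDs (trait_efo) and the curator-provided trait names (trait_efo_name), and the import checks each ID/name pair against the label known to EFO before creating the Score; a mismatch is recorded as an import error and the score is not created. Below is a minimal standalone sketch of that check — the helper names and the label mapping are assumptions for illustration; the actual code resolves labels through the EFOTrait model and the EFO API.

# Minimal sketch of the EFO ID/name consistency check introduced by this PR.
# Helper names and the label mapping are illustrative only.

def normalise_efo_id(raw_id: str) -> str:
    """Normalise 'EFO:0001645' to 'EFO_0001645', as the importer does."""
    return raw_id.replace(':', '_').strip()

def check_trait_pairs(trait_ids, trait_names, efo_labels):
    """Pair each given ID with its given name and compare against the EFO label."""
    errors = []
    for raw_id, given_name in zip(trait_ids, trait_names):
        efo_id = normalise_efo_id(raw_id)
        label = efo_labels.get(efo_id)
        if label is None:
            errors.append(f"Can't create the EFO model {efo_id} ({given_name})")
        elif label.lower() != given_name.strip().lower():
            errors.append(f"The given trait name for '{efo_id}' ({given_name}) is different from the one provided by EFO ({label})")
    return errors

# Example with made-up data:
labels = {'EFO_0001645': 'coronary artery disease'}
print(check_trait_pairs(['EFO:0001645'], ['Coronary artery disease'], labels))  # []
print(check_trait_pairs(['EFO:0001645'], ['CAD'], labels))                      # one mismatch error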
61 changes: 50 additions & 11 deletions curation/imports/curation.py
@@ -1,8 +1,11 @@
import pandas as pd
from curation.imports.study import StudyImport
from curation.imports.scoring_file import ScoringFileUpdate
from curation_tracker.models import CurationPublicationAnnotation


curation_tracker = 'curation_tracker'

class CurationImport():
'''
Class responsible to import study metadata from a list of spreadsheet files.
@@ -14,7 +17,7 @@ class CurationImport():

failed_studies = {}

def __init__(self, data_path, studies_list, curation_status_by_default, scoringfiles_format_version, skip_scoringfiles):
def __init__(self, data_path, studies_list, curation_status_by_default, scoringfiles_format_version, skip_scoringfiles, skip_curationtracker):
self.curation2schema = pd.read_excel(data_path['template_schema'], sheet_name='Curation', index_col=0)
self.curation2schema_scoring = pd.read_excel(data_path['scoring_schema'], sheet_name='Columns', index_col=0)

@@ -24,12 +27,16 @@ def __init__(self, data_path, studies_list, curation_status_by_default, scoringf
self.new_scoring_path = data_path['scoring_dir']
self.scoringfiles_format_version = scoringfiles_format_version
self.skip_scoringfiles = skip_scoringfiles
self.skip_curationtracker = skip_curationtracker

self.curation_status_by_default = curation_status_by_default

self.steps_count = 2
self.step = 1
self.steps_total = 2
if self.skip_scoringfiles == False:
self.steps_count = 3
self.steps_total = self.steps_total + 1
if self.skip_curationtracker == False:
self.steps_total = self.steps_total + 1


def global_report(self):
@@ -48,6 +55,13 @@ def global_report(self):
print('\n')


def print_step(self, step_name:str):
''' Print the step number and title '''
if self.step > 1:
print('\n----------------------------------\n')
print(f"::::: Step {self.step}/{self.steps_total}: {step_name} :::::\n")


def run_curation_import(self):
'''
Method to run the curation import processes for each study:
@@ -61,31 +75,56 @@ def run_curation_import(self):
## Parsing ##
study_import = StudyImport(study_data, self.studies_path, self.curation2schema, self.curation_status_by_default)
study_import.print_title()
print(f'==> Step 1/{self.steps_count}: Parsing study data')
self.print_step('Parsing study data')
study_import.parse_curation_data()
if study_import.has_failed:
self.failed_studies[study_import.study_name] = 'import error'
continue

## Import ##
print('\n----------------------------------\n')
print(f'==> Step 2/{self.steps_count}: Importing study data')
self.step += 1
self.print_step('Importing study data')
study_import.import_curation_data()
if study_import.has_failed:
self.failed_studies[study_import.study_name] = 'import error'
continue

## Scoring files ##
if self.skip_scoringfiles == False:
print('\n----------------------------------\n')
print(f'==> Step 3/{self.steps_count}: Add header to the Scoring file(s)')
self.step += 1
self.print_step('Add header to the Scoring file(s)')
if study_import.study_scores:
for score_id, score in study_import.study_scores.items():
scoring_file_update = ScoringFileUpdate(score, study_import.study_path, self.new_scoring_path, self.curation2schema_scoring, self.scoringfiles_format_version)
is_failed = scoring_file_update.update_scoring_file()
if is_failed == True:
self.failed_studies[study_import.study_name] = 'scoring file error'
continue
else:
print(" > No scores for this study, therefore no scoring files")
if study_import.study_name in self.failed_studies.keys():
continue

## Update Curation Tracker ##
if self.skip_curationtracker == False:
self.step += 1
self.print_step('Update the study status in the Curation Tracker')
curation_pub = None
if study_import.study_publication.doi:
try:
curation_pub = CurationPublicationAnnotation.objects.using(curation_tracker).get(doi=study_import.study_publication.doi)
print(" > Study found using the publication DOI")
except CurationPublicationAnnotation.DoesNotExist:
print(" ! Study NOT found using the publication DOI")

if curation_pub == None:
try:
curation_pub = CurationPublicationAnnotation.objects.using(curation_tracker).get(study_name=study_import.study_name)
print(" > Study found in Curation Tracker, using the study name")
except CurationPublicationAnnotation.DoesNotExist:
print(" > Can't find/retrieve the study in the Curation Tracker to update its status")
self.failed_studies[study_import.study_name] = 'curation tracker error'

if curation_pub != None:
curation_pub.curation_status = 'Imported - Awaiting Release'
curation_pub.save()
print(" > Curation status updated in the Curation Tracker")

self.global_report()
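The curation-tracker step added above looks the publication annotation up by DOI first, falls back to the study name, and only updates curation_status to 'Imported - Awaiting Release' when a record was found; otherwise the study is flagged with a 'curation tracker error'. A minimal sketch of that fallback order, using an in-memory list in place of the CurationPublicationAnnotation queryset (the record layout is an assumption):

# Sketch of the DOI-first / study-name-fallback lookup; records are plain dicts here,
# while the real code queries the curation_tracker database through the ORM.

def find_annotation(records, doi, study_name):
    if doi:
        for rec in records:
            if rec.get('doi') == doi:
                print(" > Study found using the publication DOI")
                return rec
        print(" ! Study NOT found using the publication DOI")
    for rec in records:
        if rec.get('study_name') == study_name:
            print(" > Study found in Curation Tracker, using the study name")
            return rec
    print(" > Can't find/retrieve the study in the Curation Tracker to update its status")
    return None

records = [{'doi': '10.1000/xyz123', 'study_name': 'Study_A', 'curation_status': 'Curated'}]
annotation = find_annotation(records, doi=None, study_name='Study_A')
if annotation is not None:
    annotation['curation_status'] = 'Imported - Awaiting Release'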
57 changes: 36 additions & 21 deletions curation/imports/study.py
@@ -95,10 +95,14 @@ def import_curation_data(self):
'''
self.import_publication_model()
self.import_score_models()
self.import_gwas_dev_samples()
self.remove_existing_performance_metrics()
self.import_samplesets()
self.import_performance_metrics()
if len(self.failed_data_import) == 0:
self.import_gwas_dev_samples()
if len(self.failed_data_import) == 0:
self.remove_existing_performance_metrics()
if len(self.failed_data_import) == 0:
self.import_samplesets()
if len(self.failed_data_import) == 0:
self.import_performance_metrics()

# Print import warnings
if len(self.import_warnings):
@@ -108,7 +112,9 @@ def import_curation_data(self):
# Remove entries if the import failed
if len(self.failed_data_import):
self.has_failed = True
print('\n**** ERROR: Import failed! ****')
print('\n*******************************')
print('**** ERROR: Import failed! ****')
print('*******************************')
print(' - '+'\n - '.join(self.failed_data_import))
for obj in self.data_obj:
ids = self.data_ids[obj[0]]
@@ -136,17 +142,28 @@ def import_publication_model(self):
def import_score_models(self):
''' Import the Score data if the Score is not yet in the database. '''
print('> Import Scores')
for score_id, score_data in self.study.parsed_scores.items():
# Check if Score model already exists
try:
score = Score.objects.get(name=score_data.data['name'],publication__id=self.study_publication.id)
self.import_warnings.append(f'Existing Score: {score.id} ({score_id})')
self.existing_scores.append(score.id)
# Create Score model
except Score.DoesNotExist:
score = score_data.create_score_model(self.study_publication)
self.import_warnings.append(f'New Score: {score.id} ({score_id})')
self.study_scores[score_id] = score
try:
for score_id, score_data in self.study.parsed_scores.items():
# Check if Score model already exists
print(f">>> ID: {score_id} | NAME: {score_data.data['name']} | PUB ID: {self.study_publication.id}")
score_type = 'New'
try:
score = Score.objects.get(name=score_data.data['name'],publication__id=self.study_publication.id)
if score:
score_type = 'Existing'
self.existing_scores.append(score.id)
# Create Score model
except Score.DoesNotExist:
score = score_data.create_score_model(self.study_publication)

if score:
self.import_warnings.append(f'{score_type} Score: {score.id} ({score_id})')
self.study_scores[score_id] = score
else:
self.failed_data_import.append(f"Score: {score_id} couldn't be imported: \n{score_data.display_import_report_errors()}")
return
except Exception as e:
self.failed_data_import.append(f'Score(s): {e}')


def import_gwas_dev_samples(self):
@@ -289,7 +306,7 @@ def import_performance_metrics(self):
current_score = Score.objects.get(id__iexact=i[0])
except Score.DoesNotExist:
self.failed_data_import.append(f'Performance Metric: can\'t find the Score {i[0]} in the database')
continue
break

related_SampleSet = self.study_samplesets[i[1]]

@@ -303,10 +320,8 @@ def import_performance_metrics(self):
self.data_ids['performance'].append(study_performance.num)
else:
self.import_warnings.append(f'Performance Metric not created because of an issue while creating it.')
if 'error' in performance.report['import']:
msg = ', '.join(performance.report['import']['error'])
self.failed_data_import.append(f'Performance Metric: {msg}')
continue
self.failed_data_import.append(', '.join(performance.display_import_report_errors()))
break
# Add the performance metrics to the list
metric_models.extend(performance.metric_models)

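In import_curation_data() each later import step is now guarded by len(self.failed_data_import) == 0, so the first recorded failure stops the remaining steps instead of letting them run on inconsistent data. The same behaviour, sketched as a loop that breaks on the first recorded error (step names are illustrative):

# Sketch of the "stop at the first failed step" pattern used in import_curation_data().

class ImportRunner:
    def __init__(self):
        self.failed_data_import = []

    def import_samples(self):
        print('samples imported')

    def import_samplesets(self):
        self.failed_data_import.append('SampleSet: missing sample number')

    def import_performances(self):
        print('performances imported')   # never reached in this example

    def run(self):
        for step in (self.import_samples, self.import_samplesets, self.import_performances):
            step()
            if len(self.failed_data_import) != 0:   # same guard as in the diff
                break
        return len(self.failed_data_import) == 0

runner = ImportRunner()
print(runner.run())                 # False
print(runner.failed_data_import)    # ['SampleSet: missing sample number']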
19 changes: 16 additions & 3 deletions curation/parsers/generic.py
@@ -55,7 +55,7 @@ def replace_non_ascii_chars(self,field,value):
def add_parsing_report(self, rtype, msg):
"""
Store the reported error/warning.
- rtype: type of report (e.g. error, warning)
- rtype: type of report (e.g. error, warning, import)
- msg: error message
"""
if rtype in self.report_types:
Expand Down Expand Up @@ -83,9 +83,22 @@ def parsing_report_warning(self, msg):
self.add_parsing_report('warning', msg)


def parsing_report_error_import(self, msg):
def import_report_error(self, msg):
"""
Store the reported import error.
- msg: import error message
"""
self.add_parsing_report('import', 'error', msg)
self.add_parsing_report('import', msg)


def display_import_report_errors(self, display_spreadsheet_info:bool=True):
""" Return the content of the import error reports """
report_msg = []
type = 'import'
for sp_name, messages in self.report[type].items():
prefix = ''
if display_spreadsheet_info:
prefix = f' > {sp_name}: '
for message in list(messages):
report_msg.append(f"{prefix}{message}")
return '\n'.join(report_msg)
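display_import_report_errors() flattens the per-spreadsheet 'import' report into one newline-joined string, optionally prefixing each message with the spreadsheet name. A small standalone sketch of the same formatting over a made-up report dict:

# Sketch of the formatting performed by display_import_report_errors().

def format_import_errors(report, display_spreadsheet_info=True):
    lines = []
    for sheet_name, messages in report.get('import', {}).items():
        prefix = f' > {sheet_name}: ' if display_spreadsheet_info else ''
        for message in messages:
            lines.append(f'{prefix}{message}')
    return '\n'.join(lines)

report = {'import': {'Score(s)': ["The given trait name for 'EFO_0001645' (CAD) is different from the one provided by EFO (coronary artery disease)"]}}
print(format_import_errors(report))
print(format_import_errors(report, display_spreadsheet_info=False))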
3 changes: 1 addition & 2 deletions curation/parsers/performance.py
@@ -144,7 +144,6 @@ def create_performance_model(self, publication, score, sampleset):

except IntegrityError as e:
self.model = None
self.parsing_report_error_import(e)
print('Error with the creation of the Performance(s) and/or the Metric(s)')
self.import_report_error(f'Error with the creation of the Performance(s) and/or the Metric(s): {e}')

return self.model
24 changes: 17 additions & 7 deletions curation/parsers/score.py
@@ -31,19 +31,29 @@ def create_score_model(self,publication):
with transaction.atomic():
self.model = Score()
self.model.set_score_ids(self.next_id_number(Score))
trait_ids = []
trait_names = []
for field, val in self.data.items():
if field == 'trait_efo':
efo_traits = []
for trait_id in val:
trait_id = trait_id.replace(':','_').strip()
trait = TraitData(trait_id, self.spreadsheet_name)
efo = trait.efotrait_model()
efo_traits.append(efo)
trait_ids = [x.replace(':','_').strip() for x in val]
elif field == 'trait_efo_name':
trait_names = [x.replace(':','_').strip() for x in val]
else:
if field == 'method_name':
if val in self.method_name_replacement.keys():
val = self.method_name_replacement[val]
setattr(self.model, field, val)
# Traits
trait_ids_names = zip(trait_ids, trait_names)
efo_traits = []
for trait_id, trait_name in trait_ids_names:
trait = TraitData(trait_id, trait_name, self.spreadsheet_name)
efo = trait.efotrait_model()
if efo:
efo_traits.append(efo)
else:
self.import_report_error(f"Can't create the EFO model {trait_id} ({trait_name}): {trait.display_import_report_errors(False)}")
return None
# Associate a Publication
self.model.publication = publication
self.model.save()
@@ -53,5 +63,5 @@ def create_score_model(self,publication):
self.model.save()
except IntegrityError as e:
self.model = None
print('Error with the creation of the Score(s) and/or the Trait(s)')
self.import_report_error('Error with the creation of the Score(s) and/or the Trait(s)')
return self.model
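create_score_model() now collects the trait_efo and trait_efo_name columns separately and pairs them with zip before validating each pair. One design note: zip silently drops unpaired entries when the two lists differ in length, so a length check (or zip(..., strict=True) on Python 3.10+) would catch templates where IDs and names don't line up. A short illustration:

# Illustration of the zip pairing of trait IDs and names, and why a length check helps.

trait_ids = ['EFO_0001645', 'EFO_0000275']
trait_names = ['coronary artery disease']       # second name missing from the template

print(list(zip(trait_ids, trait_names)))
# [('EFO_0001645', 'coronary artery disease')] -> the second ID is silently dropped

# Stricter variant (Python 3.10+): raises ValueError when the lists differ in length.
try:
    list(zip(trait_ids, trait_names, strict=True))
except ValueError as e:
    print(f'Mismatched trait columns: {e}')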
15 changes: 12 additions & 3 deletions curation/parsers/trait.py
@@ -5,9 +5,10 @@

class TraitData(GenericData):

def __init__(self,trait_id,spreadsheet_name):
def __init__(self,trait_id,trait_name,spreadsheet_name):
GenericData.__init__(self,spreadsheet_name)
self.trait_id = trait_id
self.trait_name = trait_name


def efotrait_model(self):
@@ -17,6 +18,9 @@ def efotrait_model(self):
'''
try:
self.model = EFOTrait.objects.get(id__iexact=self.trait_id)
if self.model.label.lower() != self.trait_name.lower():
self.import_report_error(f"The given trait name for '{self.trait_id}' ({self.trait_name}) is different from the one provided by EFO ({self.model.label})")
self.model = None
except EFOTrait.DoesNotExist:
self.create_efotrait_model()
return self.model
@@ -28,12 +32,17 @@ def create_efotrait_model(self):
Create an instance of the EFOTrait model.
Return type: EFOTrait model
'''
self.import_report_error(f"Test for {self.trait_id}")
try:
with transaction.atomic():
self.model = EFOTrait(id=self.trait_id)
self.model.parse_api()
self.model.save()
if self.model.label.lower() == self.trait_name.lower():
self.model.save()
else:
self.import_report_error(f"The given trait name for '{self.trait_id}' ({self.trait_name}) is different from the one provided by EFO ({self.model.label})")
self.model = None
except IntegrityError as e:
self.model = None
print('Error with the creation of the EFOTrait model')
self.import_report_error('Error with the creation of the EFOTrait model')
return self.model
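The label comparison against the curator-given trait name now appears twice in trait.py: once when an existing EFOTrait is found and once after a new one is fetched from the EFO API. A hedged sketch of factoring that comparison into a single helper (the method name trait_name_matches is hypothetical, not part of the PR), which would keep the two code paths and their error message in step:

# Sketch only: a possible shared helper for the duplicated label check in trait.py.

class TraitNameCheck:
    def __init__(self, trait_id, trait_name):
        self.trait_id = trait_id
        self.trait_name = trait_name
        self.errors = []

    def trait_name_matches(self, efo_label):
        """Case-insensitive comparison of the given name with the EFO label."""
        if efo_label.lower() == self.trait_name.lower():
            return True
        self.errors.append(f"The given trait name for '{self.trait_id}' ({self.trait_name}) is different from the one provided by EFO ({efo_label})")
        return False

check = TraitNameCheck('EFO_0001645', 'coronary artery disease')
print(check.trait_name_matches('Coronary Artery Disease'))  # True
print(check.trait_name_matches('stroke'))                   # False, error recorded
print(check.errors)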
37 changes: 20 additions & 17 deletions curation/template_parser.py
@@ -170,23 +170,26 @@ def extract_scores(self, license=None):
# Loop through the rows (i.e. score)
for score_name, score_info in self.table_scores.iterrows():
parsed_score = ScoreData(score_name,spreadsheet_name)
if license:
parsed_score.add_data('license', license)
# Loop through the columns
for col, val in score_info.items():
if pd.isnull(val) is False:
# Map to schema
m, f = self.get_model_field_from_schema(col,current_schema)

# Add to ScoreData if it's from the Score model
if m == model:
if f == 'trait_efo':
efo_list = val.split(',')
parsed_score.add_data(f, efo_list)
else:
parsed_score.add_data(f, val)
self.update_report(parsed_score)
self.parsed_scores[score_name] = parsed_score
if parsed_score:
if license:
parsed_score.add_data('license', license)
# Loop through the columns
for col, val in score_info.items():
if pd.isnull(val) is False:
# Map to schema
m, f = self.get_model_field_from_schema(col,current_schema)

# Add to ScoreData if it's from the Score model
if m == model:
if f in ['trait_efo','trait_efo_name']:
efo_list = val.split(',')
parsed_score.add_data(f, efo_list)
else:
parsed_score.add_data(f, val)
self.update_report(parsed_score)
self.parsed_scores[score_name] = parsed_score
else:
self.report_error(spreadsheet_name, f"Can't parse the Score '{score_name}'!")


def extract_samples(self):
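extract_scores() now treats both the trait_efo and the new trait_efo_name columns as comma-separated cells and stores them as lists on the ScoreData. Since the split is on ',' alone, leading spaces survive until the later strip() in score.py, and trait names that themselves contain commas would be split apart — something to keep in mind for the template. A small illustration:

# Illustration of splitting the comma-separated template cells into paired lists.

efo_cell = 'EFO_0001645, EFO_0000275'
name_cell = 'coronary artery disease, hypertension'

efo_list = [x.strip() for x in efo_cell.split(',')]
name_list = [x.strip() for x in name_cell.split(',')]

print(list(zip(efo_list, name_list)))
# [('EFO_0001645', 'coronary artery disease'), ('EFO_0000275', 'hypertension')]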
Binary file modified curation/templates/TemplateColumns2Models.xlsx