Fetch the EFO names, to check that they match the given EFO IDs and i… #296

Open — wants to merge 2 commits into master
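At a glance: the curation template now carries both the EFO trait IDs (trait_efo) and the curator-provided trait names (trait_efo_name), and the import checks each ID/name pair against the label known to EFO before creating the Score; a mismatch is recorded as an import error and the score is not created. Below is a minimal standalone sketch of that check — the helper names and the label mapping are assumptions for illustration; the actual code resolves labels through the EFOTrait model and the EFO API.

# Minimal sketch of the EFO ID/name consistency check introduced by this PR.
# Helper names and the label mapping are illustrative only.

def normalise_efo_id(raw_id: str) -> str:
    """Normalise 'EFO:0001645' to 'EFO_0001645', as the importer does."""
    return raw_id.replace(':', '_').strip()

def check_trait_pairs(trait_ids, trait_names, efo_labels):
    """Pair each given ID with its given name and compare against the EFO label."""
    errors = []
    for raw_id, given_name in zip(trait_ids, trait_names):
        efo_id = normalise_efo_id(raw_id)
        label = efo_labels.get(efo_id)
        if label is None:
            errors.append(f"Can't create the EFO model {efo_id} ({given_name})")
        elif label.lower() != given_name.strip().lower():
            errors.append(f"The given trait name for '{efo_id}' ({given_name}) is different from the one provided by EFO ({label})")
    return errors

# Example with made-up data:
labels = {'EFO_0001645': 'coronary artery disease'}
print(check_trait_pairs(['EFO:0001645'], ['Coronary artery disease'], labels))  # []
print(check_trait_pairs(['EFO:0001645'], ['CAD'], labels))                      # one mismatch error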
61 changes: 50 additions & 11 deletions curation/imports/curation.py
@@ -1,8 +1,11 @@
import pandas as pd
from curation.imports.study import StudyImport
from curation.imports.scoring_file import ScoringFileUpdate
from curation_tracker.models import CurationPublicationAnnotation


curation_tracker = 'curation_tracker'

class CurationImport():
'''
Class responsible to import study metadata from a list of spreadsheet files.
@@ -14,7 +17,7 @@ class CurationImport():

failed_studies = {}

def __init__(self, data_path, studies_list, curation_status_by_default, scoringfiles_format_version, skip_scoringfiles):
def __init__(self, data_path, studies_list, curation_status_by_default, scoringfiles_format_version, skip_scoringfiles, skip_curationtracker):
self.curation2schema = pd.read_excel(data_path['template_schema'], sheet_name='Curation', index_col=0)
self.curation2schema_scoring = pd.read_excel(data_path['scoring_schema'], sheet_name='Columns', index_col=0)

@@ -24,12 +27,16 @@ def __init__(self, data_path, studies_list, curation_status_by_default, scoringf
self.new_scoring_path = data_path['scoring_dir']
self.scoringfiles_format_version = scoringfiles_format_version
self.skip_scoringfiles = skip_scoringfiles
self.skip_curationtracker = skip_curationtracker

self.curation_status_by_default = curation_status_by_default

self.steps_count = 2
self.step = 1
self.steps_total = 2
if self.skip_scoringfiles == False:
self.steps_count = 3
self.steps_total = self.steps_total + 1
if self.skip_curationtracker == False:
self.steps_total = self.steps_total + 1


def global_report(self):
@@ -48,6 +55,13 @@ def global_report(self):
print('\n')


def print_step(self, step_name:str):
''' Print the step number and title '''
if self.step > 1:
print('\n----------------------------------\n')
print(f"::::: Step {self.step}/{self.steps_total}: {step_name} :::::\n")


def run_curation_import(self):
'''
Method to run the curation import processes for each study:
@@ -61,31 +75,56 @@ def run_curation_import(self):
## Parsing ##
study_import = StudyImport(study_data, self.studies_path, self.curation2schema, self.curation_status_by_default)
study_import.print_title()
print(f'==> Step 1/{self.steps_count}: Parsing study data')
self.print_step('Parsing study data')
study_import.parse_curation_data()
if study_import.has_failed:
self.failed_studies[study_import.study_name] = 'import error'
continue

## Import ##
print('\n----------------------------------\n')
print(f'==> Step 2/{self.steps_count}: Importing study data')
self.step += 1
self.print_step('Importing study data')
study_import.import_curation_data()
if study_import.has_failed:
self.failed_studies[study_import.study_name] = 'import error'
continue

## Scoring files ##
if self.skip_scoringfiles == False:
print('\n----------------------------------\n')
print(f'==> Step 3/{self.steps_count}: Add header to the Scoring file(s)')
self.step += 1
self.print_step('Add header to the Scoring file(s)')
if study_import.study_scores:
for score_id, score in study_import.study_scores.items():
scoring_file_update = ScoringFileUpdate(score, study_import.study_path, self.new_scoring_path, self.curation2schema_scoring, self.scoringfiles_format_version)
is_failed = scoring_file_update.update_scoring_file()
if is_failed == True:
self.failed_studies[study_import.study_name] = 'scoring file error'
continue
else:
print(" > No scores for this study, therefore no scoring files")
if study_import.study_name in self.failed_studies.keys():
continue

## Update Curation Tracker ##
if self.skip_curationtracker == False:
self.step += 1
self.print_step('Update the study status in the Curation Tracker')
curation_pub = None
if study_import.study_publication.doi:
try:
curation_pub = CurationPublicationAnnotation.objects.using(curation_tracker).get(doi=study_import.study_publication.doi)
print(" > Study found using the publication DOI")
except CurationPublicationAnnotation.DoesNotExist:
print(" ! Study NOT found using the publication DOI")

if curation_pub == None:
try:
curation_pub = CurationPublicationAnnotation.objects.using(curation_tracker).get(study_name=study_import.study_name)
print(" > Study found in Curation Tracker, using the study name")
except CurationPublicationAnnotation.DoesNotExist:
print(" > Can't find/retrieve the study in the Curation Tracker to update its status")
self.failed_studies[study_import.study_name] = 'curation tracker error'

if curation_pub != None:
curation_pub.curation_status = 'Imported - Awaiting Release'
curation_pub.save()
print(" > Curation status updated in the Curation Tracker")

self.global_report()
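The curation-tracker step added above looks the publication annotation up by DOI first, falls back to the study name, and only updates curation_status to 'Imported - Awaiting Release' when a record was found; otherwise the study is flagged with a 'curation tracker error'. A minimal sketch of that fallback order, using an in-memory list in place of the CurationPublicationAnnotation queryset (the record layout is an assumption):

# Sketch of the DOI-first / study-name-fallback lookup; records are plain dicts here,
# while the real code queries the curation_tracker database through the ORM.

def find_annotation(records, doi, study_name):
    if doi:
        for rec in records:
            if rec.get('doi') == doi:
                print(" > Study found using the publication DOI")
                return rec
        print(" ! Study NOT found using the publication DOI")
    for rec in records:
        if rec.get('study_name') == study_name:
            print(" > Study found in Curation Tracker, using the study name")
            return rec
    print(" > Can't find/retrieve the study in the Curation Tracker to update its status")
    return None

records = [{'doi': '10.1000/xyz123', 'study_name': 'Study_A', 'curation_status': 'Curated'}]
annotation = find_annotation(records, doi=None, study_name='Study_A')
if annotation is not None:
    annotation['curation_status'] = 'Imported - Awaiting Release'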
57 changes: 36 additions & 21 deletions curation/imports/study.py
@@ -95,10 +95,14 @@ def import_curation_data(self):
'''
self.import_publication_model()
self.import_score_models()
self.import_gwas_dev_samples()
self.remove_existing_performance_metrics()
self.import_samplesets()
self.import_performance_metrics()
if len(self.failed_data_import) == 0:
self.import_gwas_dev_samples()
if len(self.failed_data_import) == 0:
self.remove_existing_performance_metrics()
if len(self.failed_data_import) == 0:
self.import_samplesets()
if len(self.failed_data_import) == 0:
self.import_performance_metrics()

# Print import warnings
if len(self.import_warnings):
@@ -108,7 +112,9 @@ def import_curation_data(self):
# Remove entries if the import failed
if len(self.failed_data_import):
self.has_failed = True
print('\n**** ERROR: Import failed! ****')
print('\n*******************************')
print('**** ERROR: Import failed! ****')
print('*******************************')
print(' - '+'\n - '.join(self.failed_data_import))
for obj in self.data_obj:
ids = self.data_ids[obj[0]]
@@ -136,17 +142,28 @@ def import_publication_model(self):
def import_score_models(self):
''' Import the Score data if the Score is not yet in the database. '''
print('> Import Scores')
for score_id, score_data in self.study.parsed_scores.items():
# Check if Score model already exists
try:
score = Score.objects.get(name=score_data.data['name'],publication__id=self.study_publication.id)
self.import_warnings.append(f'Existing Score: {score.id} ({score_id})')
self.existing_scores.append(score.id)
# Create Score model
except Score.DoesNotExist:
score = score_data.create_score_model(self.study_publication)
self.import_warnings.append(f'New Score: {score.id} ({score_id})')
self.study_scores[score_id] = score
try:
for score_id, score_data in self.study.parsed_scores.items():
# Check if Score model already exists
print(f">>> ID: {score_id} | NAME: {score_data.data['name']} | PUB ID: {self.study_publication.id}")
score_type = 'New'
try:
score = Score.objects.get(name=score_data.data['name'],publication__id=self.study_publication.id)
if score:
score_type = 'Existing'
self.existing_scores.append(score.id)
# Create Score model
except Score.DoesNotExist:
score = score_data.create_score_model(self.study_publication)

if score:
self.import_warnings.append(f'{score_type} Score: {score.id} ({score_id})')
self.study_scores[score_id] = score
else:
self.failed_data_import.append(f"Score: {score_id} couldn't be imported: \n{score_data.display_import_report_errors()}")
return
except Exception as e:
self.failed_data_import.append(f'Score(s): {e}')


def import_gwas_dev_samples(self):
@@ -289,7 +306,7 @@ def import_performance_metrics(self):
current_score = Score.objects.get(id__iexact=i[0])
except Score.DoesNotExist:
self.failed_data_import.append(f'Performance Metric: can\'t find the Score {i[0]} in the database')
continue
break

related_SampleSet = self.study_samplesets[i[1]]

@@ -303,10 +320,8 @@ def import_performance_metrics(self):
self.data_ids['performance'].append(study_performance.num)
else:
self.import_warnings.append(f'Performance Metric not created because of an issue while creating it.')
if 'error' in performance.report['import']:
msg = ', '.join(performance.report['import']['error'])
self.failed_data_import.append(f'Performance Metric: {msg}')
continue
self.failed_data_import.append(', '.join(performance.display_import_report_errors()))
break
# Add the performance metrics to the list
metric_models.extend(performance.metric_models)

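In import_curation_data() each later import step is now guarded by len(self.failed_data_import) == 0, so the first recorded failure stops the remaining steps instead of letting them run on inconsistent data. The same behaviour, sketched as a loop that breaks on the first recorded error (step names are illustrative):

# Sketch of the "stop at the first failed step" pattern used in import_curation_data().

class ImportRunner:
    def __init__(self):
        self.failed_data_import = []

    def import_samples(self):
        print('samples imported')

    def import_samplesets(self):
        self.failed_data_import.append('SampleSet: missing sample number')

    def import_performances(self):
        print('performances imported')   # never reached in this example

    def run(self):
        for step in (self.import_samples, self.import_samplesets, self.import_performances):
            step()
            if len(self.failed_data_import) != 0:   # same guard as in the diff
                break
        return len(self.failed_data_import) == 0

runner = ImportRunner()
print(runner.run())                 # False
print(runner.failed_data_import)    # ['SampleSet: missing sample number']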
19 changes: 16 additions & 3 deletions curation/parsers/generic.py
@@ -55,7 +55,7 @@ def replace_non_ascii_chars(self,field,value):
def add_parsing_report(self, rtype, msg):
"""
Store the reported error/warning.
- rtype: type of report (e.g. error, warning)
- rtype: type of report (e.g. error, warning, import)
- msg: error message
"""
if rtype in self.report_types:
Expand Down Expand Up @@ -83,9 +83,22 @@ def parsing_report_warning(self, msg):
self.add_parsing_report('warning', msg)


def parsing_report_error_import(self, msg):
def import_report_error(self, msg):
"""
Store the reported import error.
- msg: import error message
"""
self.add_parsing_report('import', 'error', msg)
self.add_parsing_report('import', msg)


def display_import_report_errors(self, display_spreadsheet_info:bool=True):
""" Return the content of the import error reports """
report_msg = []
type = 'import'
for sp_name, messages in self.report[type].items():
prefix = ''
if display_spreadsheet_info:
prefix = f' > {sp_name}: '
for message in list(messages):
report_msg.append(f"{prefix}{message}")
return '\n'.join(report_msg)
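display_import_report_errors() flattens the per-spreadsheet 'import' report into one newline-joined string, optionally prefixing each message with the spreadsheet name. A small standalone sketch of the same formatting over a made-up report dict:

# Sketch of the formatting performed by display_import_report_errors().

def format_import_errors(report, display_spreadsheet_info=True):
    lines = []
    for sheet_name, messages in report.get('import', {}).items():
        prefix = f' > {sheet_name}: ' if display_spreadsheet_info else ''
        for message in messages:
            lines.append(f'{prefix}{message}')
    return '\n'.join(lines)

report = {'import': {'Score(s)': ["The given trait name for 'EFO_0001645' (CAD) is different from the one provided by EFO (coronary artery disease)"]}}
print(format_import_errors(report))
print(format_import_errors(report, display_spreadsheet_info=False))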
3 changes: 1 addition & 2 deletions curation/parsers/performance.py
@@ -144,7 +144,6 @@ def create_performance_model(self, publication, score, sampleset):

except IntegrityError as e:
self.model = None
self.parsing_report_error_import(e)
print('Error with the creation of the Performance(s) and/or the Metric(s)')
self.import_report_error(f'Error with the creation of the Performance(s) and/or the Metric(s): {e}')

return self.model
24 changes: 17 additions & 7 deletions curation/parsers/score.py
@@ -31,19 +31,29 @@ def create_score_model(self,publication):
with transaction.atomic():
self.model = Score()
self.model.set_score_ids(self.next_id_number(Score))
trait_ids = []
trait_names = []
for field, val in self.data.items():
if field == 'trait_efo':
efo_traits = []
for trait_id in val:
trait_id = trait_id.replace(':','_').strip()
trait = TraitData(trait_id, self.spreadsheet_name)
efo = trait.efotrait_model()
efo_traits.append(efo)
trait_ids = [x.replace(':','_').strip() for x in val]
elif field == 'trait_efo_name':
trait_names = [x.replace(':','_').strip() for x in val]
else:
if field == 'method_name':
if val in self.method_name_replacement.keys():
val = self.method_name_replacement[val]
setattr(self.model, field, val)
# Traits
trait_ids_names = zip(trait_ids, trait_names)
efo_traits = []
for trait_id, trait_name in trait_ids_names:
trait = TraitData(trait_id, trait_name, self.spreadsheet_name)
efo = trait.efotrait_model()
if efo:
efo_traits.append(efo)
else:
self.import_report_error(f"Can't create the EFO model {trait_id} ({trait_name}): {trait.display_import_report_errors(False)}")
return None
# Associate a Publication
self.model.publication = publication
self.model.save()
@@ -53,5 +63,5 @@ def create_score_model(self,publication):
self.model.save()
except IntegrityError as e:
self.model = None
print('Error with the creation of the Score(s) and/or the Trait(s)')
self.import_report_error('Error with the creation of the Score(s) and/or the Trait(s)')
return self.model
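create_score_model() now collects the trait_efo and trait_efo_name columns separately and pairs them with zip before validating each pair. One design note: zip silently drops unpaired entries when the two lists differ in length, so a length check (or zip(..., strict=True) on Python 3.10+) would catch templates where IDs and names don't line up. A short illustration:

# Illustration of the zip pairing of trait IDs and names, and why a length check helps.

trait_ids = ['EFO_0001645', 'EFO_0000275']
trait_names = ['coronary artery disease']       # second name missing from the template

print(list(zip(trait_ids, trait_names)))
# [('EFO_0001645', 'coronary artery disease')] -> the second ID is silently dropped

# Stricter variant (Python 3.10+): raises ValueError when the lists differ in length.
try:
    list(zip(trait_ids, trait_names, strict=True))
except ValueError as e:
    print(f'Mismatched trait columns: {e}')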
15 changes: 12 additions & 3 deletions curation/parsers/trait.py
@@ -5,9 +5,10 @@

class TraitData(GenericData):

def __init__(self,trait_id,spreadsheet_name):
def __init__(self,trait_id,trait_name,spreadsheet_name):
GenericData.__init__(self,spreadsheet_name)
self.trait_id = trait_id
self.trait_name = trait_name


def efotrait_model(self):
@@ -17,6 +18,9 @@ def efotrait_model(self):
'''
try:
self.model = EFOTrait.objects.get(id__iexact=self.trait_id)
if self.model.label.lower() != self.trait_name.lower():
self.import_report_error(f"The given trait name for '{self.trait_id}' ({self.trait_name}) is different from the one provided by EFO ({self.model.label})")
self.model = None
except EFOTrait.DoesNotExist:
self.create_efotrait_model()
return self.model
@@ -28,12 +32,17 @@ def create_efotrait_model(self):
Create an instance of the EFOTrait model.
Return type: EFOTrait model
'''
self.import_report_error(f"Test for {self.trait_id}")
try:
with transaction.atomic():
self.model = EFOTrait(id=self.trait_id)
self.model.parse_api()
self.model.save()
if self.model.label.lower() == self.trait_name.lower():
self.model.save()
else:
self.import_report_error(f"The given trait name for '{self.trait_id}' ({self.trait_name}) is different from the one provided by EFO ({self.model.label})")
self.model = None
except IntegrityError as e:
self.model = None
print('Error with the creation of the EFOTrait model')
self.import_report_error('Error with the creation of the EFOTrait model')
return self.model
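The label comparison against the curator-given trait name now appears twice in trait.py: once when an existing EFOTrait is found and once after a new one is fetched from the EFO API. A hedged sketch of factoring that comparison into a single helper (the method name trait_name_matches is hypothetical, not part of the PR), which would keep the two code paths and their error message in step:

# Sketch only: a possible shared helper for the duplicated label check in trait.py.

class TraitNameCheck:
    def __init__(self, trait_id, trait_name):
        self.trait_id = trait_id
        self.trait_name = trait_name
        self.errors = []

    def trait_name_matches(self, efo_label):
        """Case-insensitive comparison of the given name with the EFO label."""
        if efo_label.lower() == self.trait_name.lower():
            return True
        self.errors.append(f"The given trait name for '{self.trait_id}' ({self.trait_name}) is different from the one provided by EFO ({efo_label})")
        return False

check = TraitNameCheck('EFO_0001645', 'coronary artery disease')
print(check.trait_name_matches('Coronary Artery Disease'))  # True
print(check.trait_name_matches('stroke'))                   # False, error recorded
print(check.errors)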
37 changes: 20 additions & 17 deletions curation/template_parser.py
@@ -170,23 +170,26 @@ def extract_scores(self, license=None):
# Loop through the rows (i.e. score)
for score_name, score_info in self.table_scores.iterrows():
parsed_score = ScoreData(score_name,spreadsheet_name)
if license:
parsed_score.add_data('license', license)
# Loop through the columns
for col, val in score_info.items():
if pd.isnull(val) is False:
# Map to schema
m, f = self.get_model_field_from_schema(col,current_schema)

# Add to ScoreData if it's from the Score model
if m == model:
if f == 'trait_efo':
efo_list = val.split(',')
parsed_score.add_data(f, efo_list)
else:
parsed_score.add_data(f, val)
self.update_report(parsed_score)
self.parsed_scores[score_name] = parsed_score
if parsed_score:
if license:
parsed_score.add_data('license', license)
# Loop through the columns
for col, val in score_info.items():
if pd.isnull(val) is False:
# Map to schema
m, f = self.get_model_field_from_schema(col,current_schema)

# Add to ScoreData if it's from the Score model
if m == model:
if f in ['trait_efo','trait_efo_name']:
efo_list = val.split(',')
parsed_score.add_data(f, efo_list)
else:
parsed_score.add_data(f, val)
self.update_report(parsed_score)
self.parsed_scores[score_name] = parsed_score
else:
self.report_error(spreadsheet_name, f"Can't parse the Score '{score_name}'!")


def extract_samples(self):
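extract_scores() now treats both the trait_efo and the new trait_efo_name columns as comma-separated cells and stores them as lists on the ScoreData. Since the split is on ',' alone, leading spaces survive until the later strip() in score.py, and trait names that themselves contain commas would be split apart — something to keep in mind for the template. A small illustration:

# Illustration of splitting the comma-separated template cells into paired lists.

efo_cell = 'EFO_0001645, EFO_0000275'
name_cell = 'coronary artery disease, hypertension'

efo_list = [x.strip() for x in efo_cell.split(',')]
name_list = [x.strip() for x in name_cell.split(',')]

print(list(zip(efo_list, name_list)))
# [('EFO_0001645', 'coronary artery disease'), ('EFO_0000275', 'hypertension')]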
Binary file modified curation/templates/TemplateColumns2Models.xlsx