From 57ccb9dac583b50c78c79e35b86055acf352ee49 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 14 Oct 2024 10:20:04 +0100 Subject: [PATCH 01/12] New python/nextflow processing xref pipeline --- nextflow/config/xref.config | 68 +- nextflow/workflows/xrefDownload.nf | 312 +-- nextflow/workflows/xrefProcess.nf | 627 ++++++ scripts/xrefs/cleanup_and_split_source.pl | 4 +- scripts/xrefs/cleanup_source.pl | 2 +- scripts/xrefs/coordinate_mapper.pl | 531 +++++ scripts/xrefs/dump_ensembl.pl | 86 + scripts/xrefs/refseq_coordinate_parser.pl | 468 +++++ src/python/ensembl/common/Params.py | 436 ++-- src/python/ensembl/common/__init__.py | 15 + .../production/xrefs/AdvisoryXrefReport.py | 39 + .../ensembl/production/xrefs/Alignment.py | 91 + src/python/ensembl/production/xrefs/Base.py | 1780 +++++++++-------- .../ensembl/production/xrefs/Checksum.py | 54 +- .../production/xrefs/CoordinateMapping.py | 50 + .../ensembl/production/xrefs/DirectXrefs.py | 39 + .../production/xrefs/DownloadSource.py | 96 +- .../ensembl/production/xrefs/DumpEnsembl.py | 81 + .../ensembl/production/xrefs/DumpXref.py | 135 ++ .../xrefs/EmailAdvisoryXrefReport.py | 100 + .../production/xrefs/EmailNotification.py | 390 +++- .../ensembl/production/xrefs/Mapping.py | 91 + .../ensembl/production/xrefs/ParseSource.py | 90 + .../production/xrefs/ProcessAlignment.py | 37 + .../production/xrefs/RNACentralMapping.py | 62 + .../production/xrefs/ScheduleAlignment.py | 73 + .../production/xrefs/ScheduleCleanup.py | 82 +- .../production/xrefs/ScheduleDownload.py | 90 +- .../production/xrefs/ScheduleMapping.py | 56 + .../ensembl/production/xrefs/ScheduleParse.py | 219 ++ .../production/xrefs/ScheduleSpecies.py | 178 ++ .../production/xrefs/UniParcMapping.py | 62 + .../ensembl/production/xrefs/__init__.py | 15 + .../xrefs/config/xref_all_sources.json | 58 +- .../production/xrefs/config/xref_config.ini | 35 +- .../production/xrefs/mappers/BasicMapper.py | 432 ++++ .../xrefs/mappers/ChecksumMapper.py | 111 + .../xrefs/mappers/CoordinateMapper.py | 130 ++ .../production/xrefs/mappers/CoreInfo.py | 320 +++ .../xrefs/mappers/DirectXrefsMapper.py | 182 ++ .../production/xrefs/mappers/DisplayXrefs.py | 871 ++++++++ .../xrefs/mappers/OfficialNaming.py | 637 ++++++ .../xrefs/mappers/ProcessMappings.py | 382 ++++ .../production/xrefs/mappers/ProcessMoves.py | 478 +++++ .../production/xrefs/mappers/ProcessPaired.py | 248 +++ .../xrefs/mappers/ProcessPriorities.py | 408 ++++ .../xrefs/mappers/RNACentralMapper.py | 28 + .../production/xrefs/mappers/TestMappings.py | 199 ++ .../production/xrefs/mappers/UniParcMapper.py | 28 + .../production/xrefs/mappers/XrefLoader.py | 804 ++++++++ .../production/xrefs/mappers/__init__.py | 15 + .../xrefs/mappers/methods/ChecksumBasic.py | 91 + .../xrefs/mappers/methods/MySQLChecksum.py | 48 + .../xrefs/mappers/methods/__init__.py | 15 + .../xrefs/mappers/species/__init__.py | 15 + .../xrefs/mappers/species/aedes_aegypti.py | 39 + .../mappers/species/anopheles_gambiae.py | 42 + .../mappers/species/culex_quinquefasciatus.py | 49 + .../xrefs/mappers/species/danio_rerio.py | 30 + .../xrefs/mappers/species/drosophila.py | 44 + .../xrefs/mappers/species/eukaryota.py | 277 +++ .../xrefs/mappers/species/homo_sapiens.py | 29 + .../mappers/species/ixodes_scapularis.py | 42 + .../xrefs/mappers/species/mus_musculus.py | 29 + .../mappers/species/neurospora_crassa.py | 33 + .../xrefs/mappers/species/parasite.py | 46 + .../mappers/species/rattus_norvegicus.py | 29 + .../species/saccharomyces_cerevisiae.py | 41 + 
.../xrefs/mappers/species/sars_cov_2.py | 131 ++ .../species/schizosaccharomyces_pombe.py | 41 + .../xrefs/mappers/species/sus_scrofa.py | 29 + .../xrefs/mappers/species/wormbase.py | 124 ++ .../xrefs/parsers/ArrayExpressParser.py | 161 ++ .../production/xrefs/parsers/BaseParser.py | 972 +++++++++ .../production/xrefs/parsers/CCDSParser.py | 101 + .../production/xrefs/parsers/DBASSParser.py | 114 ++ .../xrefs/parsers/EntrezGeneParser.py | 120 ++ .../production/xrefs/parsers/HGNCParser.py | 421 ++++ .../production/xrefs/parsers/HPAParser.py | 74 + .../xrefs/parsers/JGI_ProteinParser.py | 60 + .../production/xrefs/parsers/MGIParser.py | 72 + .../xrefs/parsers/MGI_CCDS_Parser.py | 107 + .../xrefs/parsers/MGI_Desc_Parser.py | 101 + .../production/xrefs/parsers/MIMParser.py | 159 ++ .../xrefs/parsers/Mim2GeneParser.py | 170 ++ .../production/xrefs/parsers/RFAMParser.py | 193 ++ .../production/xrefs/parsers/RGDParser.py | 154 ++ .../xrefs/parsers/ReactomeParser.py | 189 ++ .../xrefs/parsers/RefSeqCoordinateParser.py | 96 + .../xrefs/parsers/RefSeqGPFFParser.py | 341 ++++ .../production/xrefs/parsers/UCSCParser.py | 136 ++ .../production/xrefs/parsers/UniProtParser.py | 452 +++++ .../production/xrefs/parsers/VGNCParser.py | 93 + .../xrefs/parsers/XenopusJamboreeParser.py | 76 + .../xrefs/parsers/ZFINDescParser.py | 62 + .../production/xrefs/parsers/ZFINParser.py | 169 ++ .../production/xrefs/parsers/__init__.py | 15 + .../production/xrefs/parsers/miRBaseParser.py | 113 ++ 98 files changed, 16237 insertions(+), 1533 deletions(-) create mode 100644 nextflow/workflows/xrefProcess.nf create mode 100644 scripts/xrefs/coordinate_mapper.pl create mode 100644 scripts/xrefs/dump_ensembl.pl create mode 100644 scripts/xrefs/refseq_coordinate_parser.pl create mode 100644 src/python/ensembl/common/__init__.py create mode 100644 src/python/ensembl/production/xrefs/AdvisoryXrefReport.py create mode 100644 src/python/ensembl/production/xrefs/Alignment.py create mode 100644 src/python/ensembl/production/xrefs/CoordinateMapping.py create mode 100644 src/python/ensembl/production/xrefs/DirectXrefs.py create mode 100644 src/python/ensembl/production/xrefs/DumpEnsembl.py create mode 100644 src/python/ensembl/production/xrefs/DumpXref.py create mode 100644 src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py create mode 100644 src/python/ensembl/production/xrefs/Mapping.py create mode 100644 src/python/ensembl/production/xrefs/ParseSource.py create mode 100644 src/python/ensembl/production/xrefs/ProcessAlignment.py create mode 100644 src/python/ensembl/production/xrefs/RNACentralMapping.py create mode 100644 src/python/ensembl/production/xrefs/ScheduleAlignment.py create mode 100644 src/python/ensembl/production/xrefs/ScheduleMapping.py create mode 100644 src/python/ensembl/production/xrefs/ScheduleParse.py create mode 100644 src/python/ensembl/production/xrefs/ScheduleSpecies.py create mode 100644 src/python/ensembl/production/xrefs/UniParcMapping.py create mode 100644 src/python/ensembl/production/xrefs/__init__.py create mode 100644 src/python/ensembl/production/xrefs/mappers/BasicMapper.py create mode 100644 src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py create mode 100644 src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py create mode 100644 src/python/ensembl/production/xrefs/mappers/CoreInfo.py create mode 100644 src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py create mode 100644 src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py create mode 100644 
src/python/ensembl/production/xrefs/mappers/OfficialNaming.py create mode 100644 src/python/ensembl/production/xrefs/mappers/ProcessMappings.py create mode 100644 src/python/ensembl/production/xrefs/mappers/ProcessMoves.py create mode 100644 src/python/ensembl/production/xrefs/mappers/ProcessPaired.py create mode 100644 src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py create mode 100644 src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py create mode 100644 src/python/ensembl/production/xrefs/mappers/TestMappings.py create mode 100644 src/python/ensembl/production/xrefs/mappers/UniParcMapper.py create mode 100644 src/python/ensembl/production/xrefs/mappers/XrefLoader.py create mode 100644 src/python/ensembl/production/xrefs/mappers/__init__.py create mode 100644 src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py create mode 100644 src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py create mode 100644 src/python/ensembl/production/xrefs/mappers/methods/__init__.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/__init__.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/drosophila.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/eukaryota.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/parasite.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/wormbase.py create mode 100644 src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/BaseParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/CCDSParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/DBASSParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/HGNCParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/HPAParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/MGIParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py create mode 100644 
src/python/ensembl/production/xrefs/parsers/MIMParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/RFAMParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/RGDParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/ReactomeParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/UCSCParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/UniProtParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/VGNCParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/ZFINParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/__init__.py create mode 100644 src/python/ensembl/production/xrefs/parsers/miRBaseParser.py diff --git a/nextflow/config/xref.config b/nextflow/config/xref.config index 024f80e68..a7cef685e 100644 --- a/nextflow/config/xref.config +++ b/nextflow/config/xref.config @@ -36,50 +36,40 @@ report { } profiles { + slurm { + process { + errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'finish' } + executor = 'slurm' + queue = 'production' + queueSize = 300 + maxRetries = 2 + time = '1d' + memory = 100.MB - lsf { - process { - errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'finish' } - executor = 'lsf' - queue = 'production' - queueSize = 100 - maxRetries = 3 - withLabel:small_process { - memory = 200.MB - //very specific to lsf - executor.perTaskReserve = 200.MB - } - withLabel: dm { - queue = 'datamover' - time = '2h' - } - } - } + withLabel:small_process { + memory = 200.MB + } - slurm { - process { - errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'finish' } - executor = 'slurm' - queue = 'production' - queueSize = 100 - maxRetries = 3 - time = '1d' + withLabel: dm { + queue = 'datamover' + memory = 2.GB + } - withLabel:small_process { - memory = 200.MB - } + withLabel:mem1GB { + memory = 1.GB + } - withLabel: dm { - queue = 'datamover' - time = '3h' - memory = 2.GB - } - withLabel:mem4GB { - time = '5d' - memory = 4.GB + withLabel:mem4GB { + memory = 4.GB + } + + withLabel:align_mem { + errorStrategy = 'retry' + maxRetries = 5 + memory = { task.attempt <= 5 ? 4.GB * (task.attempt * task.attempt) : 16.GB } + } + } } - } - } } diff --git a/nextflow/workflows/xrefDownload.nf b/nextflow/workflows/xrefDownload.nf index 65e255fda..e87458735 100644 --- a/nextflow/workflows/xrefDownload.nf +++ b/nextflow/workflows/xrefDownload.nf @@ -23,221 +23,221 @@ println """\ .stripIndent() def helpMessage() { - log.info""" - Usage: - nextflow run ensembl-production/xrefDownload.nf - --source_db_url (mandatory) Database URL to store information about xref sources. - Syntax: 'mysql://user:password@host:port/dbname' + log.info""" + Usage: + nextflow run ensembl-production/xrefDownload.nf + --source_db_url (mandatory) Database URL to store information about xref sources. + Syntax: 'mysql://user:password@host:port/dbname' - --base_path (mandatory) Path where log and source files will be stored, - a scratch space with sufficient storage is recommended. 
+ --base_path (mandatory) Path where log and source files will be stored, + a scratch space with sufficient storage is recommended. - --reuse_db (optional) If set to 1, an existing source database (specified in --source_db_url) will be reused. - Default: 0 + --reuse_db (optional) If set to 1, an existing source database (specified in --source_db_url) will be reused. + Default: 0 - --skip_download (optional) If set to 1, source files will only be downloaded if they don't already exist in --base_path. - Default: 0 + --skip_download (optional) If set to 1, source files will only be downloaded if they don't already exist in --base_path. + Default: 0 - --skip_preparse (optional) If set to 1, the pre-parse step will be skipped (no central DB). - Default: 1 + --skip_preparse (optional) If set to 1, the pre-parse step will be skipped (no central DB). + Default: 1 - --clean_files (optional) If set to 1, the Cleanup analysis will be run for RefSeq and UniProt files. - Default: 1 + --clean_files (optional) If set to 1, the Cleanup analysis will be run for RefSeq and UniProt files. + Default: 1 - --split_files_by_species (optional) If set to 1, UniProt and RefSeq file will be split according to taxonomy ID. - Default: 1 + --split_files_by_species (optional) If set to 1, UniProt and RefSeq file will be split according to taxonomy ID. + Default: 1 - --config_file (optional) Path to the json file containing information about xref sources to download. - Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_all_sources.json + --config_file (optional) Path to the json file containing information about xref sources to download. + Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_all_sources.json - --sources_config_file (optional) Path to the ini file containing information about all xref sources and species/divisions. - Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_config.ini + --sources_config_file (optional) Path to the ini file containing information about all xref sources and species/divisions. + Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_config.ini - --clean_dir (optional) Path where to save the cleaned up files. - Default: [--base_path]/clean_files + --clean_dir (optional) Path where to save the cleaned up files. + Default: [--base_path]/clean_files - --tax_ids_file (optional) Path to the file containing the taxonomy IDs of the species to extract data for. - Used to update the data for the provided species. + --tax_ids_file (optional) Path to the file containing the taxonomy IDs of the species to extract data for. + Used to update the data for the provided species. - --update_mode (optional) If set to 1, pipeline is in update mode, refreshing/updating its data for new taxonomy IDs. - Only used if --tax_ids_file is set. Default: 0 - """.stripIndent() + --update_mode (optional) If set to 1, pipeline is in update mode, refreshing/updating its data for new taxonomy IDs. + Only used if --tax_ids_file is set. 
Default: 0 + """.stripIndent() } workflow { - if (params.help || !params.source_db_url || !params.base_path) { - helpMessage() - - if (!params.source_db_url) { - println """ - Missing required param source_db_url - """.stripIndent() - } - if (!params.base_path) { - println """ - Missing required param base_path - """.stripIndent() + if (params.help || !params.source_db_url || !params.base_path) { + helpMessage() + + if (!params.source_db_url) { + println """ + Missing required param source_db_url + """.stripIndent() + } + if (!params.base_path) { + println """ + Missing required param base_path + """.stripIndent() + } + + exit 1 } - exit 1 - } - - ScheduleDownload() - timestamp = ScheduleDownload.out[0] + ScheduleDownload() + timestamp = ScheduleDownload.out[0] - DownloadSource(ScheduleDownload.out[1].splitText(), timestamp) + DownloadSource(ScheduleDownload.out[1].splitText(), timestamp) - CleanupTmpFiles(DownloadSource.out.collect()) - ScheduleCleanup(CleanupTmpFiles.out, timestamp) + CleanupTmpFiles(DownloadSource.out.collect()) + ScheduleCleanup(CleanupTmpFiles.out, timestamp) - Checksum(ScheduleCleanup.out[0], timestamp) - if (params.split_files_by_species) { - CleanupSplitSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) - NotifyByEmail(Checksum.out.concat(CleanupSplitSource.out.collect()).collect(), timestamp) - } else { - CleanupSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) - NotifyByEmail(Checksum.out.concat(CleanupSource.out.collect()).collect(), timestamp) - } + Checksum(ScheduleCleanup.out[0], timestamp) + if (params.split_files_by_species) { + CleanupSplitSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) + NotifyByEmail(Checksum.out.concat(CleanupSplitSource.out.collect()).collect(), timestamp) + } else { + CleanupSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) + NotifyByEmail(Checksum.out.concat(CleanupSource.out.collect()).collect(), timestamp) + } } process ScheduleDownload { - label 'small_process' + label 'small_process' - output: - val timestamp - path 'dataflow_sources.json' + output: + val timestamp + path 'dataflow_sources.json' - script: - timestamp = new java.util.Date().format("yyyyMMdd_HHmmss") + script: + timestamp = new java.util.Date().format("yyyyMMdd_HHmmss") - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleDownload --config_file ${params.config_file} --source_db_url ${params.source_db_url} --reuse_db ${params.reuse_db} --skip_preparse ${params.skip_preparse} --base_path ${params.base_path} --log_timestamp $timestamp - """ + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleDownload --config_file ${params.config_file} --source_db_url ${params.source_db_url} --reuse_db ${params.reuse_db} --skip_preparse ${params.skip_preparse} --base_path ${params.base_path} --log_timestamp $timestamp + """ } process DownloadSource { - label 'dm' - tag "$src_name" + label 'dm' + tag "$src_name" - input: - val x - val timestamp + input: + val x + val timestamp - output: - val 'DownloadSourceDone' + output: + val 'DownloadSourceDone' - shell: - src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + shell: + src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.DownloadSource --dataflow '$x' --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} - """ + 
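+    // Note: 'x' is one JSON line emitted by ScheduleDownload (dataflow_sources.json); the regex above pulls out its "name" field so the task can be tagged and logged per source.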
""" + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.DownloadSource --dataflow '$x' --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} + """ } process CleanupTmpFiles { - label 'small_process' + label 'small_process' - input: - val x + input: + val x - output: - val 'TmpCleanupDone' + output: + val 'TmpCleanupDone' - """ - find ${params.base_path} -type f -name "*.tmp" -delete - """ + """ + find ${params.base_path} -type f -name "*.tmp" -delete + """ } process ScheduleCleanup { - label 'small_process' + label 'small_process' - input: - val x - val timestamp + input: + val x + val timestamp - output: - val 'ScheduleCleanupDone' - path 'dataflow_cleanup_sources.json' + output: + val 'ScheduleCleanupDone' + path 'dataflow_cleanup_sources.json' - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleCleanup --base_path ${params.base_path} --source_db_url ${params.source_db_url} --clean_files ${params.clean_files} --clean_dir ${params.clean_dir} --split_files_by_species ${params.split_files_by_species} --log_timestamp $timestamp - """ + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleCleanup --base_path ${params.base_path} --source_db_url ${params.source_db_url} --clean_files ${params.clean_files} --clean_dir ${params.clean_dir} --split_files_by_species ${params.split_files_by_species} --log_timestamp $timestamp + """ } process Checksum { - label 'default_process' + label 'default_process' - input: - val x - val timestamp + input: + val x + val timestamp - output: - val 'ChecksumDone' + output: + val 'ChecksumDone' - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.Checksum --base_path ${params.base_path} --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} --log_timestamp $timestamp - """ + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.Checksum --base_path ${params.base_path} --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} --log_timestamp $timestamp + """ } process CleanupSplitSource { - label 'mem4GB' - tag "$src_name" - - input: - each x - val timestamp - - output: - val 'CleanupDone' - - shell: - cmd_params = "" - src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] - if (x =~ /"version_file":/) { - version_file = (x =~ /"version_file":\s*"(.*?)"/)[0][1] - cmd_params = "${cmd_params} --version_file '${version_file}'" - } - if (params.tax_ids_file) { - cmd_params = "${cmd_params} --tax_ids_file ${params.tax_ids_file}" - } - - """ - perl ${params.perl_scripts_dir}/cleanup_and_split_source.pl --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --clean_files ${params.clean_files} --update_mode ${params.update_mode} $cmd_params - """ + label 'mem4GB' + tag "$src_name" + + input: + each x + val timestamp + + output: + val 'CleanupDone' + + shell: + cmd_params = "" + src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + if (x =~ /"version_file":/) { + version_file = (x =~ /"version_file":\s*"(.*?)"/)[0][1] + cmd_params = "${cmd_params} --version_file '${version_file}'" + } + if (params.tax_ids_file) { + cmd_params = "${cmd_params} --tax_ids_file ${params.tax_ids_file}" + } + + """ + perl ${params.perl_scripts_dir}/cleanup_and_split_source.pl --base_path 
${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --clean_files ${params.clean_files} --update_mode ${params.update_mode} $cmd_params + """ } process CleanupSource { - label 'mem4GB' - tag "$src_name" - - input: - val x - val timestamp - - output: - val 'CleanupDone' - - shell: - cmd_params = "" - src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] - if (x =~ /"version_file":/) { - version_file = (x =~ /"version_file":\s*"(.*?)"/)[0][1] - cmd_params = "${cmd_params} --version_file '${version_file}'" - } - - """ - perl ${params.perl_scripts_dir}/cleanup_source.pl --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --skip_download ${params.skip_download} --clean_files ${params.clean_files} $cmd_params - """ + label 'mem4GB' + tag "$src_name" + + input: + val x + val timestamp + + output: + val 'CleanupDone' + + shell: + cmd_params = "" + src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + if (x =~ /"version_file":/) { + version_file = (x =~ /"version_file":\s*"(.*?)"/)[0][1] + cmd_params = "${cmd_params} --version_file '${version_file}'" + } + + """ + perl ${params.perl_scripts_dir}/cleanup_source.pl --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --skip_download ${params.skip_download} --clean_files ${params.clean_files} $cmd_params + """ } process NotifyByEmail { - label 'small_process' + label 'small_process' - input: - val x - val timestamp + input: + val x + val timestamp - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp - """ + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp + """ } diff --git a/nextflow/workflows/xrefProcess.nf b/nextflow/workflows/xrefProcess.nf new file mode 100644 index 000000000..02517aa60 --- /dev/null +++ b/nextflow/workflows/xrefProcess.nf @@ -0,0 +1,627 @@ +#!/usr/bin/env nextflow + +// Parameter default values +params.pipeline_name = 'Xref Process Pipeline' +params.help = false + +println """\ + XREF PROCESS PIPELINE + ====================== + release : ${params.release} + source_db_url : ${params.source_db_url} + base_path : ${params.base_path} + registry_url : ${params.registry_url} + xref_db_url : ${params.xref_db_url} + run_all : ${params.run_all} + species : ${params.species} + antispecies : ${params.antispecies} + division : ${params.division} + split_files_by_species : ${params.split_files_by_species} + sources_config_file : ${params.sources_config_file} + registry_file : ${params.registry_file} + dc_config_file : ${params.dc_config_file} + """ + .stripIndent() + +def helpMessage() { + log.info""" + Usage: + nextflow run ensembl-production/xrefProcess.nf + --release (mandatory) The Ensembl release. + + --source_db_url (mandatory) Database URL where information about xref sources is stored (created during xrefDownload pipeline). 
+ Syntax: 'mysql://user:password@host:port/dbname' + + --base_path (mandatory) Path where log and species files will be stored, + a scratch space with sufficient storage is recommended. + + --registry_url (mandatory) Database URL on which the registry metaSearch API will be run. + Syntax: 'mysql://user:password@host:port/dbname' + + --xref_db_url (mandatory) Database URL where the species intermediate DBs will be created. + Syntax: 'mysql://user:password@host:port/ + + --run_all (optional) If set to 1, the pipeline will run on ALL species in registry. + Default: 0 + + --species (optional) Comma-separated list of species to run pipeline on. + Will be disregarded if --run_all is set to 1. Takes precedence over --division. + + --antispecies (optional) Comma-separated list of species to disregard in the run. + + --division (optional) Comma-separated list of divisions to run pipeline on. + Will be disregarded if --run_all is set to 1. + + --split_files_by_species (optional) If set to 1, UniProt and RefSeq file will be split according to taxonomy ID. + Default: 1 + + --sources_config_file (optional) Path to the ini file containing information about all xref sources and species/divisions. + Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_config.ini + + --registry_file (mandatory) Path to the registry config file (used in perl scripts). + + --dc_config_file (mandatory) Path to the datachecks configuration file. + """.stripIndent() +} + +workflow { + // Check mandatory paremeters + if (params.help || !params.release || !params.source_db_url || !params.base_path || !params.registry_url || !params.xref_db_url || !params.registry_file || !params.dc_config_file) { + helpMessage() + + def required_params = [ + 'release' : params.release, + 'source_db_url' : params.source_db_url, + 'base_path' : params.base_path, + 'registry_url' : params.registry_url, + 'xref_db_url' : params.xref_db_url, + 'registry_file' : params.registry_file, + 'dc_config_file' : params.dc_config_file + ] + + required_params.each { param_name, param_value -> + if (!param_value) { + println """ + Missing required param '${param_name}' + """.stripIndent() + } + } + + exit 1 + } + + // Find the species in the registry + ScheduleSpecies() + timestamp = ScheduleSpecies.out[0] + species_info = ScheduleSpecies.out[1].splitText().map{it -> it.trim()} + + // Run the species flow for each species + species_flow(species_info, timestamp) + + // Send emails + EmailAdvisoryXrefReport(species_flow.out.collect(), timestamp) + NotifyByEmail(EmailAdvisoryXrefReport.out, timestamp) +} + +workflow species_flow { + take: + species_dataflow + timestamp + main: + // Extract the species name to create tuples + GetSpeciesName(species_dataflow) + + // Schedule primary sources to parse + ScheduleParse(GetSpeciesName.out, timestamp) + primary_sources_ch = process_output(ScheduleParse.out[0]) + schedule_secondary_ch = process_output(ScheduleParse.out[1]) + + // Parse primary sources + ParseSource(primary_sources_ch, timestamp) + + // Schedule secondary sources to parse + ScheduleSecondaryParse(schedule_secondary_ch, ParseSource.out.collect().count(), timestamp) + secondary_sources_ch = process_output(ScheduleSecondaryParse.out[0]) + schedule_tertiary_ch = process_output(ScheduleSecondaryParse.out[1]) + + // Parse secondary sources + ParseSecondarySource(secondary_sources_ch, timestamp) + + // Schedule tertiary sources to parse + ScheduleTertiaryParse(schedule_tertiary_ch, ParseSecondarySource.out.collect().count(), timestamp) + 
tertiary_sources_ch = process_output(ScheduleTertiaryParse.out[0]) + dump_enembl_ch = process_output(ScheduleTertiaryParse.out[1]) + + // Parse tertiary sources + ParseTertiarySource(tertiary_sources_ch, timestamp) + + // Dump ensembl sequences + DumpEnsembl(dump_enembl_ch, ParseTertiarySource.out.collect().count(), timestamp) + dump_xref_ch = process_output(DumpEnsembl.out[0]) + schedule_mapping_ch = process_output(DumpEnsembl.out[1]) + + // Dump xref sequences + DumpXref(dump_xref_ch, timestamp) + schedule_alignment_ch = process_output(DumpXref.out) + + // Schedule alignments + ScheduleAlignment(schedule_alignment_ch, timestamp) + alignment_ch = process_output(ScheduleAlignment.out) + + // Align dumps + Alignment(alignment_ch, timestamp) + + // Schedule mapping + ScheduleMapping(schedule_mapping_ch, Alignment.out.collect().count(), timestamp) + pre_mapping_ch = process_output(ScheduleMapping.out[0]) + mapping_ch = process_output(ScheduleMapping.out[1]) + + // Start pre-mapping steps + DirectXrefs(pre_mapping_ch, timestamp) + ProcessAlignment(DirectXrefs.out, timestamp) + + RnaCentralMapping(pre_mapping_ch, timestamp) + UniParcMapping(RnaCentralMapping.out, timestamp) + CoordinateMapping(UniParcMapping.out, timestamp) + + // Start mapping + Mapping(mapping_ch, ProcessAlignment.out.concat(CoordinateMapping.out).count(), timestamp) + + // Run datachecks + RunXrefCriticalDatacheck(Mapping.out) + RunXrefAdvisoryDatacheck(RunXrefCriticalDatacheck.out) + advisory_report_ch = process_output(RunXrefAdvisoryDatacheck.out) + + // Collect advisory datacheck outputs + AdvisoryXrefReport(advisory_report_ch, timestamp) + emit: + AdvisoryXrefReport.out +} + +def process_output(output_channel) { + return output_channel.flatMap { species_name, dataflow_file -> + def result = [] + for (line in dataflow_file.readLines()) { + result << tuple(species_name, line) + } + return result + } +} + +process ScheduleSpecies { + label 'small_process' + + output: + val timestamp + path 'dataflow_species.json' + + script: + timestamp = new java.util.Date().format("yyyyMMdd_HHmmss") + + shell: + cmd_params = "" + if (params.species) { + cmd_params = "${cmd_params} --species '${params.species}'" + } + if (params.antispecies) { + cmd_params = "${cmd_params} --antispecies '${params.antispecies}'" + } + if (params.division) { + cmd_params = "${cmd_params} --division '${params.division}'" + } + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleSpecies --registry_url ${params.registry_url} --run_all ${params.run_all} --release ${params.release} --base_path ${params.base_path} --log_timestamp $timestamp $cmd_params + """ +} + +process GetSpeciesName { + label 'small_process' + + input: + val dataflow + + output: + tuple val(species_name), val(dataflow) + + shell: + species_name = (dataflow =~ /"species_name":\s*"([A-Za-z0-9_.-]+)"/)[0][1] + + """ + """ +} + +process ScheduleParse { + label 'small_process' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + tuple val(species_name), path('dataflow_primary_sources.json') + tuple val(species_name), path('dataflow_schedule_secondary.json') + + shell: + cmd_params = "" + if (params.split_files_by_species) { + cmd_params = "${cmd_params} --get_species_file 1" + } + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 1 --sources_config_file 
${params.sources_config_file} --source_db_url ${params.source_db_url} --xref_db_url ${params.xref_db_url} --base_path ${params.base_path} --log_timestamp $timestamp $cmd_params + """ +} + +process ParseSource { + label 'mem1GB' + tag "$species_name - $source_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + val 'ParseSourceDone' + + shell: + source_name = (dataflow =~ /"source_name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ParseSource --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --base_path ${params.base_path} --perl_scripts_dir ${params.perl_scripts_dir} --log_timestamp $timestamp + """ +} + +process ScheduleSecondaryParse { + label 'small_process' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val wait + val timestamp + + output: + tuple val(species_name), path('dataflow_secondary_sources.json') + tuple val(species_name), path('dataflow_schedule_tertiary.json') + + shell: + cmd_params = "" + if (params.split_files_by_species) { + cmd_params = "${cmd_params} --get_species_file 1" + } + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 2 --source_db_url ${params.source_db_url} --base_path ${params.base_path} --log_timestamp $timestamp $cmd_params + """ +} + +process ParseSecondarySource { + label 'default_process' + tag "$species_name - $source_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + val 'ParseSecondarySourceDone' + + shell: + source_name = (dataflow =~ /"source_name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ParseSource --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --base_path ${params.base_path} --perl_scripts_dir ${params.perl_scripts_dir} --log_timestamp $timestamp + """ +} + +process ScheduleTertiaryParse { + label 'small_process' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val wait + val timestamp + + output: + tuple val(species_name), path('dataflow_tertiary_sources.json') + tuple val(species_name), path('dataflow_dump_ensembl.json') + + shell: + cmd_params = "" + if (params.split_files_by_species) { + cmd_params = "${cmd_params} --get_species_file 1" + } + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 3 --source_db_url ${params.source_db_url} --base_path ${params.base_path} --log_timestamp $timestamp $cmd_params + """ +} + +process ParseTertiarySource { + label 'mem1GB' + tag "$species_name - $source_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + val 'ParseTertiarySourceDone' + + shell: + source_name = (dataflow =~ /"source_name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ParseSource --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --base_path ${params.base_path} --perl_scripts_dir ${params.perl_scripts_dir} --log_timestamp $timestamp + """ +} + +process DumpEnsembl { + label 'default_process' + tag "$species_name" + + input: + tuple 
val(species_name), val(dataflow) + val wait + val timestamp + + output: + tuple val(species_name), path('dataflow_dump_xref.json') + tuple val(species_name), path('dataflow_schedule_mapping.json') + + script: + def retry_flag = task.attempt > 1 ? "--retry 1" : "" + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.DumpEnsembl --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --perl_scripts_dir ${params.perl_scripts_dir} $retry_flag --log_timestamp $timestamp + """ +} + +process DumpXref { + label 'mem1GB' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + tuple val(species_name), path('dataflow_schedule_alignment.json') + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.DumpXref --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --config_file ${params.config_file} --log_timestamp $timestamp + """ +} + +process ScheduleAlignment { + label 'small_process' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + tuple val(species_name), path('dataflow_alignment.json') + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleAlignment --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --log_timestamp $timestamp + """ +} + +process Alignment { + label 'align_mem' + tag "$species_name - $source_name ($source_id) - chunk $chunk" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + val 'AlignmentDone' + + shell: + source_name = (dataflow =~ /"source_name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + source_id = (dataflow =~ /"source_id":\s*([0-9]+)/)[0][1] + chunk = (dataflow =~ /"chunk":\s*([0-9]+)/)[0][1] + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.Alignment --dataflow '$dataflow' --base_path ${params.base_path} --log_timestamp $timestamp + """ +} + +process ScheduleMapping { + label 'small_process' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val wait + val timestamp + + output: + tuple val(species_name), path('dataflow_pre_mapping.json') + tuple val(species_name), path('dataflow_mapping.json') + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleMapping --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --registry_url ${params.registry_url} --log_timestamp $timestamp + """ +} + +process DirectXrefs { + label 'mem1GB' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + tuple val(species_name), val(dataflow) + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.DirectXrefs --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --registry_url ${params.registry_url} --log_timestamp $timestamp + """ +} + +process ProcessAlignment { + label 'mem1GB' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + val 'ProcessAlignmentDone' + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ProcessAlignment --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --registry_url ${params.registry_url} --log_timestamp $timestamp + """ +} + +process RnaCentralMapping { + label 'mem1GB' + tag "$species_name" + + input: + tuple 
val(species_name), val(dataflow) + val timestamp + + output: + tuple val(species_name), val(dataflow) + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.RNACentralMapping --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --registry_url ${params.registry_url} --source_db_url ${params.source_db_url} --log_timestamp $timestamp + """ +} + +process UniParcMapping { + label 'mem1GB' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + tuple val(species_name), val(dataflow) + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.UniParcMapping --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --registry_url ${params.registry_url} --source_db_url ${params.source_db_url} --log_timestamp $timestamp + """ +} + +process CoordinateMapping { + label 'mem1GB' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + val 'CoordinateMappingDone' + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.CoordinateMapping --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --registry_url ${params.registry_url} --source_db_url ${params.source_db_url} --perl_scripts_dir ${params.perl_scripts_dir} --log_timestamp $timestamp + """ +} + +process Mapping { + label 'mem4GB' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val wait + val timestamp + + output: + val species_name + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.Mapping --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --registry_url ${params.registry_url} --ignore_warnings ${params.ignore_warnings} --log_timestamp $timestamp + """ +} + +process RunXrefCriticalDatacheck { + label 'default_process' + tag "$species_name" + + input: + val species_name + + output: + val species_name + + """ + perl ${params.perl_scripts_dir}/run_process.pl -class='Nextflow::RunDataChecks' -datacheck_names='ForeignKeys' -datacheck_groups='xref_mapping' -datacheck_types='critical' -registry_file=${params.registry_file} -config_file=${params.dc_config_file} -history_file='${params.history_file}' -old_server_uri='${params.old_server_uri}' -failures_fatal=1 -species=$species_name + """ +} + +process RunXrefAdvisoryDatacheck { + label 'default_process' + tag "$species_name" + + input: + val species_name + + output: + tuple val(species_name), path('dataflow_4.json') + + """ + perl ${params.perl_scripts_dir}/run_process.pl -class='Nextflow::RunDataChecks' -datacheck_groups='xref_mapping' -datacheck_types='advisory' -registry_file=${params.registry_file} -config_file=${params.dc_config_file} -history_file='${params.history_file}' -old_server_uri='${params.old_server_uri}' -failures_fatal=0 -species=$species_name + """ +} + +process AdvisoryXrefReport { + label 'default_process' + tag "$species_name - $dc_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + val species_name + + shell: + dc_name = (dataflow =~ /"datacheck_name":\s*"([A-Za-z]+)"/)[0][1] + + script: + formatted_dataflow = dataflow.replace("'", '__') + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.AdvisoryXrefReport --dataflow '$formatted_dataflow' --release ${params.release} --base_path ${params.base_path} --species_name $species_name --log_timestamp $timestamp + """ 
+} + +process EmailAdvisoryXrefReport { + label 'default_process' + + input: + val wait + val timestamp + + output: + val 'done' + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailAdvisoryXrefReport --release ${params.release} --base_path ${params.base_path} --pipeline_name '${params.pipeline_name}' --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp + """ +} + +process NotifyByEmail { + label 'small_process' + + input: + val wait + val timestamp + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --release ${params.release} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp + """ +} \ No newline at end of file diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl index 3beabbcd6..cb92281a3 100644 --- a/scripts/xrefs/cleanup_and_split_source.pl +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -55,7 +55,7 @@ $log_file = catfile($log_path, "tmp_logfile_CleanupSplitSource_".int(rand(500))); add_to_log_file($log_file, "CleanupSplitSource starting for source $source_name"); - add_to_log_file($log_file, "Param: tax_ids_file = $tax_ids_file"); + add_to_log_file($log_file, "Param: tax_ids_file = $tax_ids_file") if $tax_ids_file; } # Do nothing if not a uniprot or refseq source @@ -288,4 +288,4 @@ sub add_to_log_file { print $fh "$current_timestamp | INFO | $message\n"; close($fh); } -} +} \ No newline at end of file diff --git a/scripts/xrefs/cleanup_source.pl b/scripts/xrefs/cleanup_source.pl index 5ce29a0f5..1226e6e1c 100644 --- a/scripts/xrefs/cleanup_source.pl +++ b/scripts/xrefs/cleanup_source.pl @@ -232,4 +232,4 @@ sub add_to_log_file { print $fh "$current_timestamp | INFO | $message\n"; close($fh); } -} +} \ No newline at end of file diff --git a/scripts/xrefs/coordinate_mapper.pl b/scripts/xrefs/coordinate_mapper.pl new file mode 100644 index 000000000..76c06775f --- /dev/null +++ b/scripts/xrefs/coordinate_mapper.pl @@ -0,0 +1,531 @@ +#!/usr/bin/env perl +# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +# Copyright [2016-2024] EMBL-European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
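+
+# coordinate_mapper.pl: matches coordinate-only xrefs (e.g. UCSC) against Ensembl
+# transcripts by exon overlap and writes xref/object_xref/unmapped_* dump files,
+# which are then loaded into the core database.
+#
+# Illustrative invocation (URLs, IDs and paths are placeholders, not real values):
+#   perl coordinate_mapper.pl \
+#     --xref_db_url mysql://user:pass@host:3306/xref_db \
+#     --core_db_url mysql://user:pass@host:3306/core_db \
+#     --species_id 9606 --analysis_id 1 --output_dir /path/to/output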
+
+use strict;
+use warnings;
+use Data::Dumper;
+use Carp;
+use DBI qw(:sql_types);
+use JSON;
+use Getopt::Long;
+use IO::File;
+use File::Spec::Functions qw(catfile);
+
+use Nextflow::Utils;
+use Bio::EnsEMBL::DBSQL::DBAdaptor;
+use Bio::EnsEMBL::Mapper::RangeRegistry;
+
+my ($xref_db_url, $core_db_url, $species_id, $output_dir, $analysis_id);
+GetOptions(
+  'xref_db_url=s' => \$xref_db_url,
+  'core_db_url=s' => \$core_db_url,
+  'species_id=i'  => \$species_id,
+  'output_dir=s'  => \$output_dir,
+  'analysis_id=i' => \$analysis_id
+);
+
+# Check that all parameters are passed
+if (!defined($xref_db_url) || !defined($core_db_url) || !defined($species_id) || !defined($output_dir) || !defined($analysis_id)) {
+  croak "Usage: coordinate_mapper.pl --xref_db_url --core_db_url --species_id --output_dir --analysis_id ";
+}
+
+# Set the files to use
+my $xref_filename            = catfile($output_dir, 'xref_coord.txt');
+my $object_xref_filename     = catfile($output_dir, 'object_xref_coord.txt');
+my $unmapped_reason_filename = catfile($output_dir, 'unmapped_reason_coord.txt');
+my $unmapped_object_filename = catfile($output_dir, 'unmapped_object_coord.txt');
+
+# Connect to dbs
+my ($core_host, $core_port, $core_user, $core_pass, $core_dbname) = parse_url($core_db_url);
+my $core_dbi = get_dbi($core_host, $core_port, $core_user, $core_pass, $core_dbname);
+my $xref_dbi = get_dbi(parse_url($xref_db_url));
+
+# Figure out the last used IDs in the core DB
+my $xref_id            = $core_dbi->selectall_arrayref('SELECT MAX(xref_id) FROM xref')->[0][0];
+my $object_xref_id     = $core_dbi->selectall_arrayref('SELECT MAX(object_xref_id) FROM object_xref')->[0][0];
+my $unmapped_object_id = $core_dbi->selectall_arrayref('SELECT MAX(unmapped_object_id) FROM unmapped_object')->[0][0];
+my $unmapped_reason_id = $core_dbi->selectall_arrayref('SELECT MAX(unmapped_reason_id) FROM unmapped_reason')->[0][0];
+
+my (%unmapped, %mapped);
+my $external_db_id;
+
+# Read and store available Xrefs from the Xref database
+my $xref_sth = $xref_dbi->prepare("SELECT c.coord_xref_id,s.name,c.accession FROM coordinate_xref c,source s WHERE c.source_id=s.source_id AND c.species_id=?");
+$xref_sth->bind_param(1, $species_id, SQL_INTEGER);
+$xref_sth->execute();
+
+while (my $xref = $xref_sth->fetchrow_hashref()) {
+  $external_db_id ||= $core_dbi->selectall_arrayref('SELECT external_db_id FROM external_db WHERE db_name=' . $core_dbi->quote($xref->{'name'}))->[0][0];
+  $external_db_id ||= 11000; # FIXME (11000 is 'UCSC')
+
+  $unmapped{$xref->{'coord_xref_id'}} = {
+    'external_db_id' => $external_db_id,
+    'accession'      => $xref->{'accession'},
+    'reason'         => 'No overlap',
+    'reason_full'    => 'No coordinate overlap with any Ensembl transcript'
+  };
+}
+$xref_sth->finish();
+
+if (!defined($external_db_id)) {
+  die "External_db_id is undefined for species_id = $species_id\n";
+}
+
+# Start the coordinate matching
+my $core_db_adaptor = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
+  -host   => $core_host,
+  -port   => $core_port,
+  -user   => $core_user,
+  -pass   => $core_pass,
+  -dbname => $core_dbname,
+);
+
+my $slice_adaptor = $core_db_adaptor->get_SliceAdaptor();
+my @chromosomes = @{ $slice_adaptor->fetch_all('Chromosome') };
+
+my $sql = qq(
+  SELECT coord_xref_id, accession,
+         txStart, txEnd,
+         cdsStart, cdsEnd,
+         exonStarts, exonEnds
+  FROM   coordinate_xref
+  WHERE  species_id = ?
+  AND    chromosome = ? AND strand = ?
+  AND    ((txStart BETWEEN ? AND ?)    -- txStart in region
+  OR      (txEnd   BETWEEN ? AND ?)    -- txEnd in region
+  OR      (txStart <= ?
AND txEnd >= ?)) -- region is fully contained + ORDER BY accession +); + +foreach my $chromosome (@chromosomes) { + my $chr_name = $chromosome->seq_region_name(); + my @genes = @{ $chromosome->get_all_Genes( undef, undef, 1 ) }; + + while (my $gene = shift(@genes)) { + my @transcripts = @{ $gene->get_all_Transcripts() }; + my %gene_result; + + foreach my $transcript (sort { $a->start() <=> $b->start() } @transcripts) { + ################################################################ + # For each Ensembl transcript: # + # 1. Register all Ensembl exons in a RangeRegistry. # + # # + # 2. Find all transcripts in the external database that are # + # within the range of this Ensembl transcript. # + # # + # For each of those external transcripts: # + # 3. Calculate the overlap of the exons of the external # + # transcript with the Ensembl exons using the # + # overlap_size() method in the RangeRegistry. # + # # + # 4. Register the external exons in their own RangeRegistry. # + # # + # 5. Calculate the overlap of the Ensembl exons with the # + # external exons as in step 3. # + # # + # 6. Calculate the match score. # + # # + # 7. Decide whether or not to keep the match. # + ################################################################ + + my @exons = @{ $transcript->get_all_Exons() }; + my %transcript_result; + + # '$rr1' is the RangeRegistry holding Ensembl exons for one transcript at a time. + my $rr1 = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + + my $coding_transcript; + if (defined($transcript->translation())) { + $coding_transcript = 1; + } else { + $coding_transcript = 0; + } + + foreach my $exon (@exons) { + # Register each exon in the RangeRegistry. Register both the + # total length of the exon and the coding range of the exon. + $rr1->check_and_register('exon', $exon->start(), $exon->end()); + + if ($coding_transcript + && defined($exon->coding_region_start($transcript)) + && defined($exon->coding_region_end($transcript) )) + { + $rr1->check_and_register('coding', $exon->coding_region_start($transcript), $exon->coding_region_end($transcript)); + } + } + + # Get hold of all transcripts from the external database that + # overlaps with this Ensembl transcript. + + my $sth = $xref_dbi->prepare_cached($sql); + $sth->bind_param(1, $species_id, SQL_INTEGER); + $sth->bind_param(2, $chr_name, SQL_VARCHAR); + $sth->bind_param(3, $gene->strand(), SQL_INTEGER); + $sth->bind_param(4, $transcript->start(), SQL_INTEGER); + $sth->bind_param(5, $transcript->end(), SQL_INTEGER); + $sth->bind_param(6, $transcript->start(), SQL_INTEGER); + $sth->bind_param(7, $transcript->end(), SQL_INTEGER); + $sth->bind_param(8, $transcript->start(), SQL_INTEGER); + $sth->bind_param(9, $transcript->end(), SQL_INTEGER); + $sth->execute(); + + my ($coord_xref_id, $accession, $txStart, $txEnd, $cdsStart, $cdsEnd, $exonStarts, $exonEnds); + + $sth->bind_columns(\($coord_xref_id, $accession, $txStart, $txEnd, $cdsStart, $cdsEnd, $exonStarts, $exonEnds)); + + while ($sth->fetch()) { + my @exonStarts = split(/,\s*/, $exonStarts); + my @exonEnds = split(/,\s*/, $exonEnds); + my $exonCount = scalar(@exonStarts); + + # '$rr2' is the RangeRegistry holding exons from the external + # transcript, for one transcript at a time. 
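+          # Overlap in each direction is scored as a fraction of exon length: the loop below registers the external exons in $rr2 and measures how much of each overlaps the Ensembl exon/coding ranges held in $rr1; the reverse comparison against $rr2 follows.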
+ my $rr2 = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + + my $exon_match = 0; + my $coding_match = 0; + my $coding_count = 0; + + for (my $i = 0 ; $i < $exonCount ; ++$i) { + # Register the exons from the external database in the same + # was as with the Ensembl exons, and calculate the overlap + # of the external exons with the previously registered + # Ensembl exons. + + my $overlap = $rr1->overlap_size('exon', $exonStarts[$i], $exonEnds[$i]); + $exon_match += $overlap/($exonEnds[$i] - $exonStarts[$i] + 1); + $rr2->check_and_register('exon', $exonStarts[$i], $exonEnds[$i]); + + if (!defined($cdsStart) || !defined($cdsEnd)) { + # Non-coding transcript. + } else { + my $codingStart = ($exonStarts[$i] > $cdsStart ? $exonStarts[$i] : $cdsStart); + my $codingEnd = ($exonEnds[$i] < $cdsEnd ? $exonEnds[$i] : $cdsEnd); + + if ($codingStart < $codingEnd) { + my $coding_overlap = $rr1->overlap_size('coding', $codingStart, $codingEnd); + $coding_match += $coding_overlap/($codingEnd - $codingStart + 1); + $rr2->check_and_register('coding', $codingStart, $codingEnd); + + ++$coding_count; + } + } + } + + my $rexon_match = 0; + my $rcoding_match = 0; + my $rcoding_count = 0; + + foreach my $exon (@exons) { + # Calculate the overlap of the Ensembl exons with the + # external exons. + + my $overlap = $rr2->overlap_size('exon', $exon->start(), $exon->end()); + $rexon_match += $overlap/($exon->end() - $exon->start() + 1); + + if ($coding_transcript + && defined($exon->coding_region_start($transcript)) + && defined($exon->coding_region_end($transcript) )) + { + my $coding_overlap = $rr2->overlap_size('coding', $exon->coding_region_start($transcript), $exon->coding_region_end($transcript)); + + $rcoding_match += $coding_overlap/($exon->coding_region_end($transcript) - $exon->coding_region_start($transcript) + 1); + + ++$rcoding_count; + } + } + + # Calculate the match score. + my $score = ( + ($exon_match + $ens_weight*$rexon_match) + + $coding_weight*($coding_match + $ens_weight*$rcoding_match) + )/ + ( + ($exonCount + $ens_weight*scalar(@exons)) + + $coding_weight*($coding_count + $ens_weight*$rcoding_count) + ); + + if (!defined( $transcript_result{$coord_xref_id}) || $transcript_result{$coord_xref_id} < $score) { + $transcript_result{$coord_xref_id} = $score; + } + + } + $sth->finish(); + + # Apply transcript threshold and pick the best match(es) for + # this transcript. + + my $best_score; + foreach my $coord_xref_id (sort( { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result) )) { + my $score = $transcript_result{$coord_xref_id}; + + if ($score > $transcript_score_threshold) { + $best_score ||= $score; + + if (sprintf("%.3f", $score) eq sprintf("%.3f", $best_score)) { + if (exists( $unmapped{$coord_xref_id})) { + $mapped{$coord_xref_id} = $unmapped{$coord_xref_id}; + delete( $unmapped{$coord_xref_id} ); + $mapped{$coord_xref_id}{'reason'} = undef; + $mapped{$coord_xref_id}{'reason_full'} = undef; + $mapped{$coord_xref_id}{'chr_name'} = $chr_name; + } + + push(@{ $mapped{$coord_xref_id}{'mapped_to'}}, { + 'ensembl_id' => $transcript->dbID(), + 'ensembl_object_type' => 'Transcript' + }); + + # This is now a candidate Xref for the gene. 
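+          # Keep only the highest transcript-level score per xref as its gene-level candidate score.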
+ if (!defined( $gene_result{$coord_xref_id}) || $gene_result{$coord_xref_id} < $score) { + $gene_result{$coord_xref_id} = $score; + } + } elsif (exists($unmapped{$coord_xref_id})) { + $unmapped{$coord_xref_id}{'reason'} = 'Was not best match'; + $unmapped{$coord_xref_id}{'reason_full'} = sprintf("Did not top best transcript match score (%.2f)", $best_score); + if (!defined( $unmapped{$coord_xref_id}{'score'}) || $score > $unmapped{$coord_xref_id}{'score'}) { + $unmapped{$coord_xref_id}{'score'} = $score; + $unmapped{$coord_xref_id}{'ensembl_id'} = $transcript->dbID(); + } + } + } elsif (exists( $unmapped{$coord_xref_id}) && $unmapped{$coord_xref_id}{'reason'} ne 'Was not best match') { + $unmapped{$coord_xref_id}{'reason'} = 'Did not meet threshold'; + $unmapped{$coord_xref_id}{'reason_full'} = sprintf( "Match score for transcript lower than threshold (%.2f)", $transcript_score_threshold); + if (!defined( $unmapped{$coord_xref_id}{'score'}) || $score > $unmapped{$coord_xref_id}{'score'}) { + $unmapped{$coord_xref_id}{'score'} = $score; + $unmapped{$coord_xref_id}{'ensembl_id'} = $transcript->dbID(); + } + } + } + } + } +} + +# Make all dumps. Order is important. +dump_xref($xref_filename, $xref_id, \%mapped, \%unmapped); +dump_object_xref($object_xref_filename, $object_xref_id, $analysis_id, \%mapped); +dump_unmapped_reason($unmapped_reason_filename, $unmapped_reason_id, \%unmapped, $core_dbi); +dump_unmapped_object($unmapped_object_filename, $unmapped_object_id, $analysis_id, \%unmapped); + +# Upload the dumps. Order is important. +upload_data('unmapped_reason', $unmapped_reason_filename, $external_db_id, $core_dbi); +upload_data('unmapped_object', $unmapped_object_filename, $external_db_id, $core_dbi); +upload_data('object_xref', $object_xref_filename, $external_db_id, $core_dbi); +upload_data('xref', $xref_filename, $external_db_id, $core_dbi); + +sub parse_url { + my ($url) = @_; + + my $parsed_url = Nextflow::Utils::parse($url); + my $user = $parsed_url->{'user'}; + my $pass = $parsed_url->{'pass'}; + my $host = $parsed_url->{'host'}; + my $port = $parsed_url->{'port'}; + my $db = $parsed_url->{'dbname'}; + + return ($host, $port, $user, $pass, $db) +} + +sub get_dbi { + my ($host, $port, $user, $pass, $dbname) = @_; + + my $dbconn; + if (defined $dbname) { + $dbconn = sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname); + } else { + $dbconn = sprintf("dbi:mysql:host=%s;port=%s", $host, $port); + } + my $dbi = DBI->connect( $dbconn, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr ); + + return $dbi; +} + +sub dump_xref { + my ($filename, $xref_id, $mapped, $unmapped) = @_; + + my $fh = IO::File->new('>' . $filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + + foreach my $xref (values(%{$unmapped}), values(%{$mapped})) { + # Assign 'xref_id' to this Xref. + $xref->{'xref_id'} = ++$xref_id; + + my $accession = $xref->{'accession'}; + my ($version) = ($accession =~ /\.(\d+)$/); + $version ||= 0; + + my $info_text = (defined($xref->{'chr_name'}) && $xref->{'chr_name'} eq 'Y' ? "Y Chromosome" : ""); + + $fh->printf("%d\t%d\t%s\t%s\t%d\t%s\t%s\t%s\n", + $xref->{'xref_id'}, + $xref->{'external_db_id'}, + $accession, + $accession, + $version, + '\N', + 'COORDINATE_OVERLAP', + $info_text + ); + } + $fh->close(); +} + +sub dump_object_xref { + my ($filename, $object_xref_id, $analysis_id, $mapped) = @_; + + my $fh = IO::File->new('>' . 
$filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + + foreach my $xref (values(%{$mapped})) { + foreach my $object_xref (@{ $xref->{'mapped_to'} }) { + # Assign 'object_xref_id' to this Object Xref. + $object_xref->{'object_xref_id'} = ++$object_xref_id; + + $fh->printf("%d\t%d\t%s\t%d\t%s\t%s\n", + $object_xref->{'object_xref_id'}, + $object_xref->{'ensembl_id'}, + $object_xref->{'ensembl_object_type'}, + $xref->{'xref_id'}, + '\N', + $analysis_id + ); + } + } + $fh->close(); +} + +sub dump_unmapped_reason { + my ($filename, $unmapped_reason_id, $unmapped, $core_dbi) = @_; + + # Create a list of the unique reasons. + my %reasons; + + foreach my $xref (values(%{$unmapped})) { + if (!exists($reasons{$xref->{'reason_full'}})) { + $reasons{$xref->{'reason_full'}} = { + 'summary' => $xref->{'reason'}, + 'full' => $xref->{'reason_full'} + }; + } + } + + my $fh = IO::File->new('>' . $filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + + my $sth = $core_dbi->prepare('SELECT unmapped_reason_id FROM unmapped_reason WHERE full_description = ?'); + + foreach my $reason (sort({ $a->{'full'} cmp $b->{'full'} } values(%reasons))) { + # Figure out 'unmapped_reason_id' from the core database. + $sth->bind_param(1, $reason->{'full'}, SQL_VARCHAR); + $sth->execute(); + + my $id; + $sth->bind_col(1, \$id); + $sth->fetch(); + + if (defined($id)) { + $reason->{'unmapped_reason_id'} = $id; + } else { + $reason->{'unmapped_reason_id'} = ++$unmapped_reason_id; + } + + $sth->finish(); + + $fh->printf("%d\t%s\t%s\n", + $reason->{'unmapped_reason_id'}, + $reason->{'summary'}, + $reason->{'full'} + ); + + } + $fh->close(); + + # Assign reasons to the unmapped Xrefs from %reasons. + foreach my $xref (values(%{$unmapped})) { + $xref->{'reason'} = $reasons{$xref->{'reason_full'}}; + $xref->{'reason_full'} = undef; + } +} + +sub dump_unmapped_object { + my ($filename, $unmapped_object_id, $analysis_id, $unmapped) = @_; + + my $fh = IO::File->new('>' . $filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + + foreach my $xref (values(%{$unmapped})) { + # Assign 'unmapped_object_id' to this Xref. + $xref->{'unmapped_object_id'} = ++$unmapped_object_id; + + $fh->printf( + "%d\t%s\t%s\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n", + $xref->{'unmapped_object_id'}, + 'xref', + $analysis_id || '\N', # '\N' (NULL) means no analysis exists and uploading this table will fail. + $xref->{'external_db_id'}, + $xref->{'accession'}, + $xref->{'reason'}->{'unmapped_reason_id'}, + (defined($xref->{'score'}) ? sprintf("%.3f", $xref->{'score'}) : '\N'), + '\N', + $xref->{'ensembl_id'} || '\N', + (defined($xref->{'ensembl_id'}) ? 'Transcript' : '\N'), + '\N' + ); + } + $fh->close(); +} + +sub upload_data { + my ($table_name, $filename, $external_db_id, $dbi) = @_; + + if (!-r $filename) { + croak(sprintf("Can not open '%s' for reading", $filename)); + } + + my $cleanup_sql = ''; + if ($table_name eq 'unmapped_reason') { + $cleanup_sql = qq( + DELETE ur + FROM unmapped_object uo, + unmapped_reason ur + WHERE uo.external_db_id = ? + AND ur.unmapped_reason_id = uo.unmapped_reason_id + ); + } elsif ($table_name eq 'unmapped_object') { + $cleanup_sql = qq( + DELETE uo + FROM unmapped_object uo + WHERE uo.external_db_id = ? + ); + } elsif ($table_name eq 'object_xref') { + $cleanup_sql = qq( + DELETE ox + FROM xref x, + object_xref ox + WHERE x.external_db_id = ? 
+ AND ox.xref_id = x.xref_id + ); + } elsif ($table_name eq 'xref') { + $cleanup_sql = qq( + DELETE x + FROM xref x + WHERE x.external_db_id = ? + ); + } else { + croak(sprintf("Table '%s' is unknown\n", $table_name)); + } + + my $load_sql = sprintf("LOAD DATA LOCAL INFILE ? REPLACE INTO TABLE %s", $table_name); + + my $rows = $dbi->do($cleanup_sql, undef, $external_db_id) or croak($dbi->errstr()); + + $rows = $dbi->do($load_sql, undef, $filename) or croak($dbi->errstr()); + + $dbi->do("OPTIMIZE TABLE $table_name") or croak($dbi->errstr()); +} \ No newline at end of file diff --git a/scripts/xrefs/dump_ensembl.pl b/scripts/xrefs/dump_ensembl.pl new file mode 100644 index 000000000..22132195d --- /dev/null +++ b/scripts/xrefs/dump_ensembl.pl @@ -0,0 +1,86 @@ +#!/usr/bin/env perl +# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +# Copyright [2016-2024] EMBL-European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +use strict; +use warnings; +use Data::Dumper; +use IO::File; +use Getopt::Long; +use Carp; + +use Nextflow::Utils; +use Bio::EnsEMBL::Registry; +use Bio::EnsEMBL::Utils::IO::FASTASerializer; + +my ($cdna_path, $pep_path, $species, $core_db_url, $release); +GetOptions( + 'cdna_path=s' => \$cdna_path, + 'pep_path=s' => \$pep_path, + 'species=s' => \$species, + 'core_db_url=s' => \$core_db_url, + 'release=s' => \$release +); + +# Check that all parameters are passed +if (!defined($cdna_path) || !defined($pep_path) || !defined($species) || !defined($core_db_url) || !defined($release)) { + croak "Usage: dump_ensembl.pl --cdna_path --pep_path --species --core_db_url --release "; +} + +# Open fasta files for writing +my $cdna_fh = IO::File->new($cdna_path ,'w') || croak("Cannot create filehandle $cdna_path"); + my $cdna_writer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($cdna_fh); +my $pep_fh = IO::File->new($pep_path ,'w') || croak("Cannot create filehandle $pep_path"); +my $pep_writer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($pep_fh); + +# Load the registry +my ($user, $pass, $host, $port, $dbname) = parse_url($core_db_url); +my $registry = 'Bio::EnsEMBL::Registry'; +my %registry_params = (-HOST => $host, -PORT => $port, -USER => $user, -DB_VERSION => $release); +$registry_params{-PASS} = $pass if ($pass); +$registry->load_registry_from_db(%registry_params); + +# Get transcripts +my $transcript_adaptor = $registry->get_adaptor($species, 'Core', 'Transcript'); +my $transcript_list = $transcript_adaptor->fetch_all(); + +# Dump sequence data +while (my $transcript = shift @$transcript_list) { + my $sequence = $transcript->seq(); + $sequence->id($transcript->dbID()); + $cdna_writer->print_Seq($sequence); + + # Get and dump translation data + my $translation = $transcript->translation; + if ($translation) { + $sequence = $transcript->translate; + $sequence->id($translation->dbID()); + $pep_writer->print_Seq($sequence); + } +} + +# Close file handles +$cdna_fh->close; +$pep_fh->close; + +sub parse_url { + my 
($url) = @_; + my $parsed_url = Nextflow::Utils::parse($url); + my $user = $parsed_url->{'user'}; + my $pass = $parsed_url->{'pass'}; + my $host = $parsed_url->{'host'}; + my $port = $parsed_url->{'port'}; + my $db = $parsed_url->{'dbname'}; + return ($user, $pass, $host, $port, $db); +} \ No newline at end of file diff --git a/scripts/xrefs/refseq_coordinate_parser.pl b/scripts/xrefs/refseq_coordinate_parser.pl new file mode 100644 index 000000000..808284ee4 --- /dev/null +++ b/scripts/xrefs/refseq_coordinate_parser.pl @@ -0,0 +1,468 @@ +#!/usr/bin/env perl +# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +# Copyright [2016-2024] EMBL-European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +use strict; +use warnings; +use Data::Dumper; +use Carp; +use DBI; +use JSON; +use Getopt::Long; + +use Nextflow::Utils; +use Bio::EnsEMBL::Registry; +use Bio::EnsEMBL::Mapper::RangeRegistry; + +my ($xref_db_url, $core_db_url, $otherf_db_url, $source_ids_json, $species_id, $species_name, $release); +GetOptions( + 'xref_db_url=s' => \$xref_db_url, + 'core_db_url=s' => \$core_db_url, + 'otherf_db_url=s' => \$otherf_db_url, + 'source_ids=s' => \$source_ids_json, + 'species_id=i' => \$species_id, + 'species_name=s' => \$species_name, + 'release=i' => \$release +); + +# Check that all parameters are passed +if (!defined($xref_db_url) || !defined($core_db_url) || !defined($otherf_db_url) || !defined($source_ids_json) || !defined($species_id) || !defined($species_name) || !defined($release)) { + croak "Usage: refseq_coordinate_parser.pl --xref_db_url --core_db_url --otherf_db_url --source_ids --species_id --species_name --release "; +} + +my $transcript_score_threshold = 0.75; +my $tl_transcript_score_threshold = 0.75; + +# Extract the source ids +my $source_ids = decode_json($source_ids_json); + +# Connect to the xref db +my ($user, $pass, $host, $port, $xref_db) = parse_url($xref_db_url); +my $dbi = get_dbi($host, $port, $user, $pass, $xref_db); + +# Load the registry +my $registry = 'Bio::EnsEMBL::Registry'; +my ($core_user, $core_pass, $core_host, $core_port, $core_dbname) = parse_url($core_db_url); +my ($otherf_user, $otherf_pass, $otherf_host, $otherf_port, $otherf_dbname) = parse_url($otherf_db_url); +$registry->load_registry_from_multiple_dbs( + { + -host => $core_host, + -port => $core_port, + -user => $core_user, + -pass => $core_pass || '', + -fb_version => $release + }, + { + -host => $otherf_host, + -port => $otherf_port, + -user => $otherf_user, + -pass => $otherf_pass || '', + -fb_version => $release + }, +); + +# Get the EntrezGene and WikiGene accessions +my (%entrez_ids) = %{ get_valid_codes("EntrezGene", $species_id, $dbi) }; +my (%wiki_ids) = %{ get_valid_codes('WikiGene', $species_id, $dbi) }; + +# Prepare link sql +my $add_dependent_xref_sth = $dbi->prepare("INSERT INTO dependent_xref (master_xref_id, dependent_xref_id, linkage_source_id) VALUES (?,?,?)"); + +# Get the db adaptors +my $otherf_dba = 
$registry->get_DBAdaptor($species_name, 'otherfeatures'); +my $core_dba = $otherf_dba->dnadb(); + +# Get the slice adaptors +my $otherf_sa = $otherf_dba->get_SliceAdaptor(); +my $core_sa = $core_dba->get_SliceAdaptor(); + +# Fetch analysis object for refseq +my $logic_name; +my $otherf_aa = $otherf_dba->get_AnalysisAdaptor(); +foreach my $analysis_adaptor (@{ $otherf_aa->fetch_all() }) { + if ($analysis_adaptor->logic_name =~ /refseq_import/) { + $logic_name = $analysis_adaptor->logic_name; + } +} + +# Not all species have refseq_import data, skip if not found +if (!defined $logic_name) { + print STDERR "No data found for RefSeq_import, skipping import\n"; + exit; +} + +# Get otherfeatures chromosomes +my $otherf_chromosomes = $otherf_sa->fetch_all('toplevel', undef, 1); +foreach my $otherf_chromosome (@$otherf_chromosomes) { + my $chr_name = $otherf_chromosome->seq_region_name(); + + # Get otherfeatures genes + my $otherf_genes = $otherf_chromosome->get_all_Genes($logic_name, undef, 1); + while (my $otherf_gene = shift @$otherf_genes) { + # Get otherfeatures transcripts + my $otherf_transcripts = $otherf_gene->get_all_Transcripts(); + foreach my $otherf_transcript (sort { $a->start() <=> $b->start() } @$otherf_transcripts) { + # Get the RefSeq accession (either the display xref or the stable ID) + my $refseq_acc; + if (defined $otherf_transcript->display_xref) { + $refseq_acc = $otherf_transcript->display_xref->display_id; + } elsif (defined $otherf_transcript->stable_id) { + $refseq_acc = $otherf_transcript->stable_id; + } else { + # Skip non-conventional accessions + next; + } + next if (!defined($refseq_acc) || $refseq_acc !~ /^[NXMR]{2}_[0-9]+/); + + my (%transcript_result, %tl_transcript_result); + my ($start, $end, $overlap); + + # Get otherfeatures exons + my $otherf_exons = $otherf_transcript->get_all_Exons(); + my $otherf_tl_exons = $otherf_transcript->get_all_translateable_Exons(); + + # Create a range registry for all the exons of the refseq transcript + my $rr1 = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + my $rr3 = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + + foreach my $otherf_exon (@$otherf_exons) { + $start = $otherf_exon->seq_region_start(); + $end = $otherf_exon->seq_region_end(); + $rr1->check_and_register('exon', $start, $end); + } + + foreach my $otherf_tl_exon (@$otherf_tl_exons) { + $start = $otherf_tl_exon->seq_region_start(); + $end = $otherf_tl_exon->seq_region_end(); + $rr3->check_and_register('exon', $start, $end); + } + + # Fetch slice in core database which overlaps refseq transcript + my $core_chromosome = $core_sa->fetch_by_region('toplevel', $chr_name, $otherf_transcript->seq_region_start, $otherf_transcript->seq_region_end); + + # Get core transcripts + my $core_transcripts = $core_chromosome->get_all_Transcripts(1); + foreach my $core_transcript (@$core_transcripts) { + next if ($core_transcript->strand != $otherf_transcript->strand); + + # Get core exons + my $core_exons = $core_transcript->get_all_Exons(); + my $core_tl_exons = $core_transcript->get_all_translateable_Exons(); + + # Create a range registry for all the exons of the ensembl transcript + my $rr2 = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + my $rr4 = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + + my ($core_exon_match, $core_tl_exon_match, $otherf_exon_match, $otherf_tl_exon_match) = (0, 0, 0, 0); + + foreach my $core_exon (@$core_exons) { + $start = $core_exon->seq_region_start(); + $end = $core_exon->seq_region_end(); + $overlap = $rr1->overlap_size('exon', $start, $end); + 
$core_exon_match += $overlap/($end - $start + 1); + $rr2->check_and_register('exon', $start, $end); + } + + foreach my $core_tl_exon (@$core_tl_exons) { + $start = $core_tl_exon->seq_region_start(); + $end = $core_tl_exon->seq_region_end(); + $overlap = $rr3->overlap_size('exon', $start, $end); + $core_tl_exon_match += $overlap/($end - $start + 1); + $rr4->check_and_register('exon', $start, $end); + } + + # Look for overlap between the two sets of exons + foreach my $otherf_exon (@$otherf_exons) { + $start = $otherf_exon->seq_region_start(); + $end = $otherf_exon->seq_region_end(); + $overlap = $rr2->overlap_size('exon', $start, $end); + $otherf_exon_match += $overlap/($end - $start + 1); + } + + foreach my $otherf_tl_exon (@$otherf_tl_exons) { + $start = $otherf_tl_exon->seq_region_start(); + $end = $otherf_tl_exon->seq_region_end(); + $overlap = $rr4->overlap_size('exon', $start, $end); + $otherf_tl_exon_match += $overlap/($end - $start + 1); + } + + # Compare exon matching with number of exons to give a score + my $score = ( ($otherf_exon_match + $core_exon_match)) / (scalar(@$otherf_exons) + scalar(@$core_exons) ); + my $tl_score = 0; + if (scalar(@$otherf_tl_exons) > 0) { + $tl_score = ( ($otherf_tl_exon_match + $core_tl_exon_match)) / (scalar(@$otherf_tl_exons) + scalar(@$core_tl_exons) ); + } + if ($core_transcript->biotype eq $otherf_transcript->biotype) { + $transcript_result{$core_transcript->stable_id} = $score; + $tl_transcript_result{$core_transcript->stable_id} = $tl_score; + } else { + $transcript_result{$core_transcript->stable_id} = $score * 0.90; + $tl_transcript_result{$core_transcript->stable_id} = $tl_score * 0.90; + } + } + + my ($best_score, $best_tl_score) = (0, 0); + my ($best_id, $score, $tl_score); + + # Compare the scores based on coding exon overlap + # If there is a stalemate, choose the best exon overlap score + foreach my $tid (sort { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result)) { + $score = $transcript_result{$tid}; + $tl_score = $tl_transcript_result{$tid}; + + if ($score > $transcript_score_threshold || $tl_score > $tl_transcript_score_threshold) { + if ($tl_score >= $best_tl_score) { + if ($tl_score > $best_tl_score) { + $best_id = $tid; + $best_score = $score; + $best_tl_score = $tl_score; + } elsif ($tl_score == $best_tl_score) { + if ($score > $best_score) { + $best_id = $tid; + $best_score = $score; + } + } + } + if (!defined $best_id) { + if ($score >= $best_score) { + $best_id = $tid; + $best_score = $score; + } + } + } + } + + # If a best match was defined for the refseq transcript, store it as direct xref for ensembl transcript + if ($best_id) { + my ($acc, $version) = split(/\./, $refseq_acc); + $version =~ s/\D//g if $version; + + # Set the appropriate source ID + my $source_id; + $source_id = $source_ids->{'mrna'} if $acc =~ /^NM_/; + $source_id = $source_ids->{'ncrna'} if $acc =~ /^NR_/; + $source_id = $source_ids->{'mrna_predicted'} if $acc =~ /^XM_/; + $source_id = $source_ids->{'ncrna_predicted'} if $acc =~ /^XR_/; + next if (!defined($source_id)); + + my $xref_id = add_xref({ + acc => $acc, + version => $version, + label => $refseq_acc, + desc => undef, + source_id => $source_id, + species_id => $species_id, + dbi => $dbi, + info_type => 'DIRECT' + }); + add_direct_xref($xref_id, $best_id, "Transcript", "", $dbi); + + my $otherf_gene = $otherf_transcript->get_Gene(); + my $entrez_id = $otherf_gene->stable_id(); + my $otherf_translation = $otherf_transcript->translation(); + my $core_ta = 
$core_dba->get_TranscriptAdaptor(); + my $transcript = $core_ta->fetch_by_stable_id($best_id); + my $translation = $transcript->translation(); + + # Add link between Ensembl gene and EntrezGene (and WikiGene) + if (defined $entrez_ids{$entrez_id} ) { + foreach my $dependent_xref_id (@{$entrez_ids{$entrez_id}}) { + $add_dependent_xref_sth->execute($xref_id, $dependent_xref_id, $source_ids->{'entrezgene'}); + } + foreach my $dependent_xref_id (@{$wiki_ids{$entrez_id}}) { + $add_dependent_xref_sth->execute($xref_id, $dependent_xref_id, $source_ids->{'wikigene'}); + } + } + + # Also store refseq protein as direct xref for ensembl translation, if translation exists + if (defined $translation && defined $otherf_translation && ($otherf_translation->seq eq $translation->seq)) { + my $translation_id = $otherf_translation->stable_id(); + my @xrefs = grep {$_->{dbname} eq 'GenBank'} @{$otherf_translation->get_all_DBEntries}; + if (scalar @xrefs == 1) { + $translation_id = $xrefs[0]->primary_id(); + } + + ($acc, $version) = split(/\./, $translation_id); + + $source_id = $source_ids->{'peptide'}; + $source_id = $source_ids->{'peptide_predicted'} if $acc =~ /^XP_/; + my $tl_xref_id = add_xref({ + acc => $acc, + version => $version, + label => $translation_id, + desc => undef, + source_id => $source_id, + species_id => $species_id, + dbi => $dbi, + info_type => 'DIRECT' + }); + add_direct_xref($tl_xref_id, $translation->stable_id(), "Translation", "", $dbi); + } + } + } + } +} + +sub parse_url { + my ($url) = @_; + + my $parsed_url = Nextflow::Utils::parse($url); + my $user = $parsed_url->{'user'}; + my $pass = $parsed_url->{'pass'}; + my $host = $parsed_url->{'host'}; + my $port = $parsed_url->{'port'}; + my $db = $parsed_url->{'dbname'}; + + return ($user, $pass, $host, $port, $db); +} + +sub get_dbi { + my ($host, $port, $user, $pass, $dbname) = @_; + + my $dbconn; + if (defined $dbname) { + $dbconn = sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname); + } else { + $dbconn = sprintf("dbi:mysql:host=%s;port=%s", $host, $port); + } + my $dbi = DBI->connect( $dbconn, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr ); + + return $dbi; +} + +sub get_valid_codes{ + my ($source_name, $species_id, $dbi) = @_; + + my %valid_codes; + my @sources; + + my $big_name = uc $source_name; + my $sql = "select source_id from source where upper(name) like '%$big_name%'"; + my $sth = $dbi->prepare($sql); + $sth->execute(); + while(my @row = $sth->fetchrow_array()){ + push @sources,$row[0]; + } + $sth->finish; + + foreach my $source (@sources){ + $sql = "select accession, xref_id from xref where species_id = $species_id and source_id = $source"; + $sth = $dbi->prepare($sql); + $sth->execute(); + while(my @row = $sth->fetchrow_array()){ + push @{$valid_codes{$row[0]}}, $row[1]; + } + } + $sth->finish(); + + return \%valid_codes; +} + +sub add_xref { + my ($arg_ref) = @_; + + my $acc = $arg_ref->{acc} || croak 'add_xref needs aa acc'; + my $source_id = $arg_ref->{source_id} || croak 'add_xref needs a source_id'; + my $species_id = $arg_ref->{species_id} || croak 'add_xref needs a species_id'; + my $label = $arg_ref->{label} // $acc; + my $description = $arg_ref->{desc}; + my $version = $arg_ref->{version} // 0; + my $info_type = $arg_ref->{info_type} // 'MISC'; + my $info_text = $arg_ref->{info_text} // q{}; + my $dbi = $arg_ref->{dbi}; + + # See if it already exists. 
If so return the existing xref_id + my $xref_id; + my $get_xref_sth = $dbi->prepare('SELECT xref_id FROM xref WHERE accession = ? AND source_id = ? AND species_id = ?'); + $get_xref_sth->execute($acc, $source_id, $species_id) or croak( $dbi->errstr() ); + if (my @row = $get_xref_sth->fetchrow_array()) { + $xref_id = $row[0]; + } + $get_xref_sth->finish(); + + if(defined $xref_id){ + return $xref_id; + } + + my $add_xref_sth = $dbi->prepare('INSERT INTO xref (accession,version,label,description,source_id,species_id, info_type, info_text) VALUES(?,?,?,?,?,?,?,?)'); + + # If the description is more than 255 characters, chop it off + if (defined $description && ((length $description) > 255 )) { + my $truncmsg = ' /.../'; + substr $description, 255 - (length $truncmsg), length $truncmsg, $truncmsg; + } + + # Add the xref and croak if it fails + $add_xref_sth->execute($acc, $version || 0, $label, $description, $source_id, $species_id, $info_type, $info_text) + or croak("$acc\t$label\t\t$source_id\t$species_id\n"); + + $add_xref_sth->finish(); + + return $add_xref_sth->{'mysql_insertid'}; +} + +sub add_direct_xref { + my ($general_xref_id, $ensembl_stable_id, $ensembl_type, $linkage_type, $dbi) = @_; + + # Check if such a mapping exists yet + my @existing_xref_ids = get_direct_xref($ensembl_stable_id, $ensembl_type, $linkage_type, $dbi); + if (scalar grep { $_ == $general_xref_id } @existing_xref_ids) { + return; + } + + $ensembl_type = lc($ensembl_type); + my $add_direct_xref_sth = $dbi->prepare('INSERT INTO ' . $ensembl_type . '_direct_xref VALUES (?,?,?)'); + + $add_direct_xref_sth->execute($general_xref_id, $ensembl_stable_id, $linkage_type); + $add_direct_xref_sth->finish(); + + return; +} + +sub get_direct_xref{ + my ($stable_id, $type, $link, $dbi) = @_; + + $type = lc $type; + + my $sql = "SELECT general_xref_id FROM ${type}_direct_xref d WHERE ensembl_stable_id = ? AND linkage_xref"; + my @sql_params = ( $stable_id ); + if (defined $link) { + $sql .= '= ?'; + push @sql_params, $link; + } else { + $sql .= 'is null'; + } + my $direct_sth = $dbi->prepare($sql); + + $direct_sth->execute( @sql_params ) || croak( $dbi->errstr() ); + if (wantarray ()) { + # Generic behaviour + my @results; + + my $all_rows = $direct_sth->fetchall_arrayref(); + foreach my $row_ref ( @{ $all_rows } ) { + push @results, $row_ref->[0]; + } + + return @results; + } else { + # Backwards-compatible behaviour + if (my @row = $direct_sth->fetchrow_array()) { + return $row[0]; + } + } + $direct_sth->finish(); + + return; +} \ No newline at end of file diff --git a/src/python/ensembl/common/Params.py b/src/python/ensembl/common/Params.py index ef9371f99..b7a163a14 100644 --- a/src/python/ensembl/common/Params.py +++ b/src/python/ensembl/common/Params.py @@ -19,215 +19,233 @@ import json import argparse +from typing import Dict, Any + sys.tracebacklimit = 0 + class Params: - def __init__(self, params: dict=None, parse_dataflow_json: bool=True) -> None: - """ - Parameters - ---------- - params: dict, optional - The parameters to start the object with. If defined, command-line parameters won't be parsed (default is None) - parse_dataflow_json: bool, optional - Specifies whether to parse an option called 'dataflow' in the provided options (default is True) - """ - if params: - self._params = params - else: - self._params = {} - self.parse_argv_params(parse_dataflow_json) - - def parse_argv_params(self, parse_dataflow_json: bool=True): - """Parses command-line arguments and extracts them into the Params object. 
- Command-line arguments need to be passed in the format "--name value". - - Parameters - ---------- - parse_dataflow_json: bool, optional - Specifies whether to parse an option called 'dataflow' in the provided options (default is True) - """ - args = sys.argv[1:] - - # Extract param names from command line - r = re.compile(r"^--") - param_names = list(filter(r.match, args)) - - parser = argparse.ArgumentParser() - for name in param_names: - parser.add_argument(name) - - params = parser.parse_args() - for param_name in vars(params): - if param_name == 'dataflow' and parse_dataflow_json: - dataflow_params = json.loads(getattr(params, param_name)) - for name,value in dataflow_params.items(): - self.param(name, value) - else: - self.param(param_name, getattr(params, param_name)) - - def param(self, name: str, new_value=None, options: dict={}): - """ Gets or sets a parameter value. - - Parameters - ---------- - name: str - The name of the paramater - new_value: any, optional - The value to set the parameter to (default is None) - options: dict, optional - Extra options, including: - - default: The default value to use if parameter has no value (sets the parameter value to this) - - type: The type of the parameter value, used to check if value is valid - - Returns - ------- - The value of the parameter with provided name. - - Raises - ------ - AttributeError - If no parameter name was passed. - """ - if not name: - raise AttributeError('You must supply a parameter name') - - value = None - - if new_value is not None: - self._params[name] = new_value - value = new_value - else: - value = self._params.get(name) - if value is None and options.get('default') is not None: - default = options['default'] - self._params[name] = default - value = default - - if options.get('type'): - return self.check_type(name, value, options['type']) - - return value - - def param_required(self, name: str, options: dict={}): - """ Gets a parameter value, raising an error if no value is found. - - Parameters - ---------- - name: str - The name of th parameter - options: dict, optional - Extra options, including: - - default: The default value to use if parameter has no value (sets the parameter value to this) - - type: The type of the parameter value, used to check if value is valid - - Returns - ------- - The value of the parameter with provided name. - - Raises - ------ - AttributeError - If no value is found for the required paramater. - """ - value = self.param(name, None, options) - - if value is None: - raise AttributeError(f'Parameter \'{name}\' is required but has no value') - - return value - - def check_type(self, name: str, value, value_type: str): - """ Checks if the parameter value provided is valid. - For specific types, this function can change the parameter value. - - Parameters - ---------- - name: str - The name of the parameter - value: any - The value of the parameter - value_type: str - The type of the parameter value. Accepted types: - - hash, dict, or dictionary - - array or list - - int or integer - - bool or boolean - - str or string - - Returns - ------- - None if no value is found, or the new value of the parameter with provided name. - - Raises - ------ - AttributeError - If no parameter name is provided. - If parameter value is not valid. 
- """ - if not name: - raise AttributeError('You must supply a parameter name') - if value is None: - return - - value_type = value_type.lower() - error = 0 - new_value = None - - if value_type in ['hash', 'dict', 'dictionary'] and not isinstance(value, dict): - error = 1 - elif value_type in ['array', 'list'] and not isinstance(value, list): - # Try to split by commas - if re.search(",", value): - new_value = value.split(",") - else: - new_value = [value] - elif value_type in ['integer', 'int'] and not isinstance(value, int): - # Try to make it an integer - try: - new_value = int(value) - except ValueError: - error = 1 - elif value_type in ['bool', 'boolean'] and not isinstance(value, bool): - # Try to make it a boolean - if isinstance(value, int): - new_value = bool(value) - elif value in ['0', '1']: - new_value = bool(int(value)) - else: - error = 1 - elif value_type in ['str', 'string'] and not isinstance(value, str): - new_value = str(value) - - if error: - raise AttributeError(f'Parameter \'{name}\' has an invalid value \'{value}\'. Must be of type {value_type}') - - self.param(name, new_value) - return new_value - - def write_output(self, suffix: str, params: dict): - """ Appends data to the dataflow json file (passed into next pipeline process). - - Parameters - ---------- - suffix: str - The file suffix to add to the output file name (dataflow_[suffix].json) - params: dict - The data to append into the file - """ - # Remove null params - params = {k: v for k, v in params.items() if v is not None} - - with open(f'dataflow_{suffix}.json', 'a') as fh: - json.dump(params, fh) - fh.write("\n") - - def write_all_output(self, suffix: str): - """ Appends all of the parameters in the object into the dataflow json file. - This calls the write_output function. - - Parameters - ---------- - suffix: str - The file suffix to add to the output file name (dataflow_[suffix].json) - """ - self.write_output(suffix, self._params) + def __init__(self, params: Dict[str, Any] = None, parse_dataflow_json: bool = True) -> None: + """Params constructor. + + Parameters + ---------- + params: dict, optional + The parameters to start the object with. If defined, command-line parameters won't be parsed (default is None) + parse_dataflow_json: bool, optional + Specifies whether to parse an option called 'dataflow' in the provided options (default is True) + """ + if params is None: + params = {} + + if params: + self._params = params + else: + self._params = {} + self.parse_argv_params(parse_dataflow_json) + + def parse_argv_params(self, parse_dataflow_json: bool = True) -> None: + """Parses command-line arguments and extracts them into the Params object. + Command-line arguments need to be passed in the format "--name value". 
+ + Parameters + ---------- + parse_dataflow_json: bool, optional + Specifies whether to parse an option called 'dataflow' in the provided options (default is True) + """ + args = sys.argv[1:] + + # Extract param names from command line + r = re.compile(r"^--") + param_names = list(filter(r.match, args)) + + parser = argparse.ArgumentParser() + for name in param_names: + parser.add_argument(name) + + params = parser.parse_args() + for param_name in vars(params): + if param_name == "dataflow" and parse_dataflow_json: + dataflow_params = json.loads(getattr(params, param_name)) + for name, value in dataflow_params.items(): + self.param(name, value) + else: + self.param(param_name, getattr(params, param_name)) + + def param(self, name: str, new_value: Any = None, options: Dict[str, Any] = None) -> Any: + """Gets or sets a parameter value. + + Parameters + ---------- + name: str + The name of the parameter + new_value: any, optional + The value to set the parameter to (default is None) + options: dict, optional + Extra options, including: + - default: The default value to use if parameter has no value (sets the parameter value to this) + - type: The type of the parameter value, used to check if value is valid + + Returns + ------- + The value of the parameter with provided name. + + Raises + ------ + AttributeError + If no parameter name was passed. + """ + if not name: + raise AttributeError("You must supply a parameter name") + if options is None: + options = {} + + value = None + + if new_value is not None: + self._params[name] = new_value + value = new_value + else: + value = self._params.get(name) + if value is None and options.get("default") is not None: + default = options["default"] + self._params[name] = default + value = default + + if options.get("type"): + return self.check_type(name, value, options["type"]) + + return value + + def param_required(self, name: str, options: Dict[str, Any] = None) -> Any: + """Gets a parameter value, raising an error if no value is found. + + Parameters + ---------- + name: str + The name of the parameter + options: dict, optional + Extra options, including: + - default: The default value to use if parameter has no value (sets the parameter value to this) + - type: The type of the parameter value, used to check if value is valid + + Returns + ------- + The value of the parameter with provided name. + + Raises + ------ + AttributeError + If no value is found for the required parameter. + """ + value = self.param(name, None, options) + + if value is None: + raise AttributeError(f"Parameter '{name}' is required but has no value") + + return value + + def check_type(self, name: str, value: Any, value_type: str) -> Any: + """Checks if the parameter value provided is valid. + For specific types, this function can change the parameter value. + + Parameters + ---------- + name: str + The name of the parameter + value: any + The value of the parameter + value_type: str + The type of the parameter value. Accepted types: + - hash, dict, or dictionary + - array or list + - int or integer + - bool or boolean + - str or string + + Returns + ------- + None if no value is found, or the new value of the parameter with provided name. + + Raises + ------ + AttributeError + If no parameter name is provided. + If parameter value is not valid. 
+ """ + if not name: + raise AttributeError("You must supply a parameter name") + if value is None: + return + + value_type = value_type.lower() + error, update = False, True + new_value = None + + if value_type in ["hash", "dict", "dictionary"] and not isinstance(value, dict): + error = True + elif value_type in ["array", "list"] and not isinstance(value, list): + # Try to split by commas + if re.search(",", value): + new_value = value.split(",") + else: + new_value = [value] + elif value_type in ["int", "integer"] and not isinstance(value, int): + # Try to make it an integer + try: + new_value = int(value) + except ValueError: + error = True + elif value_type in ["bool", "boolean"] and not isinstance(value, bool): + # Try to make it a boolean + if isinstance(value, int): + new_value = bool(value) + elif isinstance(value, str) and value in ["True", "False"]: + new_value = (value == "True") + elif value in ["0", "1", 0, 1]: + new_value = bool(int(value)) + else: + error = True + elif value_type in ["str", "string"] and not isinstance(value, str): + new_value = str(value) + else: + update = False + + if error: + raise AttributeError( + f"Parameter '{name}' has an invalid value '{value}'. Must be of type {value_type}" + ) + + if update: + self.param(name, new_value) + value = new_value + + return value + + def write_output(self, suffix: str, params: Dict[str, Any]) -> None: + """Appends data to the dataflow json file (passed into next pipeline process). + + Parameters + ---------- + suffix: str + The file suffix to add to the output file name (dataflow_[suffix].json) + params: dict + The data to append into the file + """ + # Remove null params + params = {k: v for k, v in params.items() if v is not None} + + with open(f"dataflow_{suffix}.json", "a") as fh: + json.dump(params, fh) + fh.write("\n") + + def write_all_output(self, suffix: str) -> None: + """Appends all of the parameters in the object into the dataflow json file. + This calls the write_output function. + + Parameters + ---------- + suffix: str + The file suffix to add to the output file name (dataflow_[suffix].json) + """ + self.write_output(suffix, self._params) diff --git a/src/python/ensembl/common/__init__.py b/src/python/ensembl/common/__init__.py new file mode 100644 index 000000000..e68076fa5 --- /dev/null +++ b/src/python/ensembl/common/__init__.py @@ -0,0 +1,15 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common modules.""" diff --git a/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py b/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py new file mode 100644 index 000000000..a869c1266 --- /dev/null +++ b/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py @@ -0,0 +1,39 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Xref module to print out advisory datachecks results (only needed now since we are still using perl datachecks).""" + +from ensembl.production.xrefs.Base import * + + +class AdvisoryXrefReport(Base): + def run(self): + base_path = self.param_required("base_path", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + datacheck_name = self.param("datacheck_name", None, {"type": "str"}) + datacheck_output = self.param("datacheck_output", None, {"type": "str"}) + + # Create or locate report file + report_file = self.get_path( + base_path, species_name, release, "dc_report", f"{datacheck_name}.log" + ) + + # Return the quotation marks into the output + datacheck_output = re.sub("__", "'", datacheck_output) + + # Write datacheck result into file + with open(report_file, "a") as fh: + fh.write(datacheck_output) + fh.write("\n") diff --git a/src/python/ensembl/production/xrefs/Alignment.py b/src/python/ensembl/production/xrefs/Alignment.py new file mode 100644 index 000000000..b8ee417a1 --- /dev/null +++ b/src/python/ensembl/production/xrefs/Alignment.py @@ -0,0 +1,91 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Alignment module to map xref sequences into ensEMBL ones.""" + +from ensembl.production.xrefs.Base import * + + +class Alignment(Base): + def run(self): + base_path = self.param_required("base_path", {"type": "str"}) + method = self.param_required("align_method", {"type": "str"}) + query_cutoff = self.param_required("query_cutoff", {"type": "int"}) + target_cutoff = self.param_required("target_cutoff", {"type": "int"}) + max_chunks = self.param_required("max_chunks", {"type": "int"}) + chunk = self.param_required("chunk", {"type": "int"}) + job_index = self.param_required("job_index", {"type": "int"}) + source = self.param_required("source_file", {"type": "str"}) + target = self.param_required("target_file", {"type": "str"}) + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + map_file = self.param_required("map_file", {"type": "str"}) + source_id = self.param_required("source_id", {"type": "int"}) + seq_type = self.param_required("seq_type", {"type": "str"}) + + # Construct Exonerate command + ryo = "xref:%qi:%ti:%ei:%ql:%tl:%qab:%qae:%tab:%tae:%C:%s\n" + exe = ( + subprocess.check_output("which exonerate", shell=True) + .decode("utf-8") + .strip() + ) + command_string = f"{exe} --showalignment FALSE --showvulgar FALSE --ryo '{ryo}' --gappedextension FALSE --model 'affine:local' {method} --subopt no --query {source} --target {target} --querychunktotal {max_chunks} --querychunkid {chunk}" + + # Get exonerate hits + output = subprocess.run(command_string, shell=True, stdout=subprocess.PIPE) + + exit_code = abs(output.returncode) + if exit_code == 0: + hits = output.stdout.decode("utf-8").split("\n") + + # Write to mapping file + map_fh = open(map_file, "w") + for hit in hits: + if re.search(r"^xref", hit): + map_fh.write(f"{hit}\n") + map_fh.close() + elif exit_code == 9: + raise MemoryError( + f"Exonerate failed due to insufficient memory (exit code: {exit_code})" + ) + elif exit_code == 256: + raise SyntaxError( + f"Exonerate failed due to unexpected character(s) in files (exit code: {exit_code})" + ) + else: + raise Exception(f"Exonerate failed with exit_code: {output.returncode}") + + # Add job and mapping data into db + db_engine = self.get_db_engine(xref_db_url) + with db_engine.connect() as xref_dbi: + out_file = f"xref_{seq_type}.{max_chunks}-{chunk}.out" + job_id = f"{source_id}{job_index}{chunk}" + xref_dbi.execute( + insert(MappingJobsORM).values( + map_file=map_file, + status="SUBMITTED", + out_file=out_file, + err_file=out_file, + array_number=chunk, + job_id=job_id, + ) + ) + xref_dbi.execute( + insert(MappingORM).values( + job_id=job_id, + method=seq_type, + percent_query_cutoff=query_cutoff, + percent_target_cutoff=target_cutoff, + ) + ) diff --git a/src/python/ensembl/production/xrefs/Base.py b/src/python/ensembl/production/xrefs/Base.py index d5022627f..3a59abfc0 100644 --- a/src/python/ensembl/production/xrefs/Base.py +++ b/src/python/ensembl/production/xrefs/Base.py @@ -29,10 +29,11 @@ import random import csv import subprocess +import unicodedata -from sqlalchemy import create_engine, select, insert, update, text, func, and_ +from sqlalchemy import create_engine, select, insert, update, text, func, and_, delete from sqlalchemy.engine.url import make_url, URL -from sqlalchemy.engine import Connection +from sqlalchemy.engine import Engine, Connection from sqlalchemy.orm import aliased from sqlalchemy_utils import database_exists, create_database, drop_database from urllib.parse import urlparse @@ -40,822 +41,973 @@ from itertools import groupby from 
configparser import ConfigParser from datetime import datetime - -from ensembl.xrefs.xref_source_db_model import Base as XrefSourceDB, Source as SourceSORM, Version as VersionORM, ChecksumXref as ChecksumXrefSORM - -from ensembl.xrefs.xref_update_db_model import Base as XrefUpdateDB, Source as SourceUORM, SourceURL as SourceURLORM, Xref as XrefUORM, \ - PrimaryXref as PrimaryXrefORM, DependentXref as DependentXrefUORM, GeneDirectXref as GeneDirectXrefORM, TranscriptDirectXref as TranscriptDirectXrefORM, \ - TranslationDirectXref as TranslationDirectXrefORM, Synonym as SynonymORM, Pairs as PairsORM, Species as SpeciesORM, \ - SourceMappingMethod as SourceMappingMethodORM, MappingJobs as MappingJobsORM, Mapping as MappingORM - -from ensembl.core.models import Meta as MetaCORM, Gene as GeneORM, Transcript as TranscriptORM, Analysis as AnalysisORM, \ - ExonTranscript as ExonTranscriptORM, SupportingFeature as SupportingFeatureORM, DnaAlignFeature as DnaAlignFeatureORM, \ - TranscriptAttrib as TranscriptAttribORM, AttribType as AttribTypeORM, AnalysisDescription as AnalysisDescriptionORM, \ - SeqRegion as SeqRegionORM, SeqRegionAttrib as SeqRegionAttribORM, CoordSystem as CoordSystemORM, Translation as TranslationORM, \ - Exon as ExonORM, Xref as XrefCORM, DependentXref as DependentXrefCORM, ExternalDb as ExternalDbORM, Dna as DnaORM, ObjectXref as ObjectXrefCORM +from pyspark import SparkConf +from pyspark.sql import SparkSession +from typing import IO, List, Dict, Any, Iterator, Optional + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper + +from ensembl.xrefs.xref_source_db_model import ( + Base as XrefSourceDB, + Source as SourceSORM, + Version as VersionORM, + ChecksumXref as ChecksumXrefSORM, +) + +from ensembl.xrefs.xref_update_db_model import ( + Base as XrefUpdateDB, + Source as SourceUORM, + SourceURL as SourceURLORM, + Xref as XrefUORM, + PrimaryXref as PrimaryXrefORM, + DependentXref as DependentXrefUORM, + CoordinateXref as CoordinateXrefORM, + GeneDirectXref as GeneDirectXrefORM, + TranscriptDirectXref as TranscriptDirectXrefORM, + TranslationDirectXref as TranslationDirectXrefORM, + Synonym as SynonymORM, + Pairs as PairsORM, + Species as SpeciesORM, + MappingJobs as MappingJobsORM, + Mapping as MappingORM, +) + +from ensembl.core.models import ( + Meta as MetaCORM, + Analysis as AnalysisORM, + AnalysisDescription as AnalysisDescriptionORM, + SeqRegion as SeqRegionORM, + CoordSystem as CoordSystemORM, + Dna as DnaORM, + Gene as GeneORM, + Transcript as TranscriptORM, + Translation as TranslationORM, + Exon as ExonORM, + ExonTranscript as ExonTranscriptORM, + SupportingFeature as SupportingFeatureORM, + DnaAlignFeature as DnaAlignFeatureORM, + AttribType as AttribTypeORM, + TranscriptAttrib as TranscriptAttribORM, + SeqRegionAttrib as SeqRegionAttribORM, + Xref as XrefCORM, + DependentXref as DependentXrefCORM, + ExternalDb as ExternalDbORM, + ObjectXref as ObjectXrefCORM, +) from ensembl.common.Params import Params + class Base(Params): - """ Class to represent the base of xref modules. Inherits the Params class. - """ - def __init__(self, params: dict=None, parse_dataflow_json: bool=True) -> None: - """ Calls the parent __init__ then sets some specific parameters. - - Parameters - ---------- - params: dict, optional - The parameters to start the object with. 
If defined, command-line parameters won't be parsed (default is None) - parse_dataflow_json: bool, optional - Specifies whether to parse an option called 'dataflow' in the provided options (default is True) - """ - super().__init__(params, parse_dataflow_json) - - self.param('metasearch_url', "http://registry-grpc.ebi.ac.uk:8080/registry/metaSearch") - - # Initialize the logfile for this run - if self.param('log_timestamp'): - current_timestamp = self.param('log_timestamp') - else: - current_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - - log_path = os.path.join(self.param_required('base_path'), 'logs', current_timestamp) - if not os.path.exists(log_path): os.makedirs(log_path, exist_ok = True) - - log_file = os.path.join(log_path, 'tmp_logfile_'+self.__class__.__name__+'_'+str(random.randint(0, 5000))) - self._log_file = log_file - - console_handler = logging.StreamHandler() - file_handler = logging.FileHandler(log_file, mode='a') - console_handler.setLevel(logging.WARNING) - file_handler.setLevel(logging.DEBUG) - - logging.basicConfig( - level=logging.DEBUG, - format='%(asctime)s | %(levelname)s | %(message)s', - datefmt='%d-%b-%Y %H:%M:%S', - handlers=[console_handler, file_handler] - ) - - def create_source_db(self, source_url: str, reuse_db_if_present: bool): - """ Creates the xref source database from model. - - Parameters - ---------- - source_url: str - The source database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname] - reuse_db_if_present: bool - If set to False, the database defined by provided URL will be dropped before creating a new one - """ - url = make_url(source_url) - engine = create_engine(url, isolation_level="AUTOCOMMIT") - - if url.database and reuse_db_if_present: - return - - if database_exists(engine.url): - drop_database(engine.url) - create_database(engine.url) - XrefSourceDB.metadata.create_all(engine) - - def download_file(self, file: str, base_path: str, source_name: str, extra_args: dict): - """ Downloads an xref file and saves into provided space. - - Parameters - ---------- - file: str - The URL of the file to download. Acceptable URL schemes: ftp, http, and https - base_path: str - The path to save the downloaded file into - source_name: str - The xref source name - extra_args: dict - Extra options, including: - - skip_download_if_file_present: If set to True, file is only downloaded if does not exist - - db: The type of external db for the xref source (only relevent here if equal to 'checksum') - - release: If set to 'version', then this is a version file download - - rel_number: The URL used to retrieve the release number (only for RefSeq) - - catalog: The URL used to retrieve the release catalog (only for RefSeq) - - Returns - ------- - The path of the downloaded file. - - Raises - ------ - LookupError - If rel_number is provided but no release number was found in URL. - AttributeError - If file URL scheme is invalid. 
- """ - # Create uri object and get scheme - uri = urlparse(file) - if not uri.scheme: - return file - - # Get extra parameters - skip_download_if_file_present = extra_args.get('skip_download_if_file_present') or False - db = extra_args.get('db') - release = extra_args.get('release') - rel_number = extra_args.get('rel_number') - catalog = extra_args.get('catalog') - - # Create file download path - orig_source_name = source_name - source_name = re.sub(r"\/", "", source_name) - dest_dir = os.path.join(base_path, source_name) - if db and db == 'checksum': - dest_dir = os.path.join(base_path, 'Checksum') - if not os.path.exists(dest_dir): os.makedirs(dest_dir, exist_ok = True) - - file_path = "" - - # If file is in local ftp, copy from there - if re.search("ftp.ebi.ac.uk", file): - # Construct local path - local_file = file - local_file = re.sub("https://ftp.ebi.ac.uk/pub/", "/nfs/ftp/public/", local_file) - - # Check if local file exists - if os.path.exists(local_file): - file_path = os.path.join(dest_dir, os.path.basename(uri.path)) - if db and db == 'checksum': - file_path = os.path.join(dest_dir, f'{source_name}-{os.path.basename(uri.path)}') - - if not (skip_download_if_file_present and os.path.exists(file_path)): - shutil.copy(local_file, file_path) - - # Check if copy was successful - if os.path.exists(file_path): - logging.info(f'{orig_source_name} file copied from local FTP: {file_path}') - if release: - return file_path + """Class to represent the base of xref modules. Inherits the Params class.""" + + def __init__(self, params: Dict[str, Any] = None, parse_dataflow_json: bool = True) -> None: + """Calls the parent __init__ then sets some specific parameters. + + Parameters + ---------- + params: dict, optional + The parameters to start the object with. If defined, command-line parameters won't be parsed (default is None) + parse_dataflow_json: bool, optional + Specifies whether to parse an option called 'dataflow' in the provided options (default is True) + """ + super().__init__(params, parse_dataflow_json) + + self.param( + "metasearch_url", "http://registry-grpc.ebi.ac.uk:8080/registry/metaSearch" + ) + + # Initialize the logfile for this run (except for the Alignment module) + module_name = self.__class__.__name__ + if module_name != "Alignment": + if self.param("log_timestamp"): + current_timestamp = self.param("log_timestamp") + else: + current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + log_path = os.path.join( + self.param_required("base_path"), "logs", current_timestamp + ) + if not os.path.exists(log_path): + os.makedirs(log_path, exist_ok=True) + + log_file = os.path.join( + log_path, + "tmp_logfile_" + module_name + "_" + str(random.randint(0, 5000)), + ) + self._log_file = log_file + + console_handler = logging.StreamHandler() + file_handler = logging.FileHandler(log_file, mode="a") + console_handler.setLevel(logging.WARNING) + file_handler.setLevel(logging.DEBUG) + + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s | %(levelname)s | %(message)s", + datefmt="%d-%b-%Y %H:%M:%S", + handlers=[console_handler, file_handler], + ) + + def create_source_db(self, source_url: str, reuse_db_if_present: bool) -> None: + """Creates the xref source database from model. 
+ + Parameters + ---------- + source_url: str + The source database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname] + reuse_db_if_present: bool + If set to False, the database defined by provided URL will be dropped before creating a new one + """ + url = make_url(source_url) + engine = create_engine(url, isolation_level="AUTOCOMMIT") + + if url.database and reuse_db_if_present: + return + + if database_exists(engine.url): + drop_database(engine.url) + create_database(engine.url) + XrefSourceDB.metadata.create_all(engine) + + def download_file(self, file: str, base_path: str, source_name: str, extra_args: Dict[str, Any]) -> str: + """Downloads an xref file and saves into provided space. + + Parameters + ---------- + file: str + The URL of the file to download. Acceptable URL schemes: ftp, http, and https + base_path: str + The path to save the downloaded file into + source_name: str + The xref source name + extra_args: dict + Extra options, including: + - skip_download_if_file_present: If set to True, file is only downloaded if does not exist + - db: The type of external db for the xref source (only relevent here if equal to 'checksum') + - release: If set to 'version', then this is a version file download + - rel_number: The URL used to retrieve the release number (only for RefSeq) + - catalog: The URL used to retrieve the release catalog (only for RefSeq) + + Returns + ------- + The path of the downloaded file. + + Raises + ------ + LookupError + If rel_number is provided but no release number was found in URL. + AttributeError + If file URL scheme is invalid. + """ + # Create uri object and get scheme + uri = urlparse(file) + if not uri.scheme: + return file + + # Get extra parameters + skip_download_if_file_present = ( + extra_args.get("skip_download_if_file_present") or False + ) + db = extra_args.get("db") + release = extra_args.get("release") + rel_number = extra_args.get("rel_number") + catalog = extra_args.get("catalog") + + # Create file download path + orig_source_name = source_name + source_name = re.sub(r"\/", "", source_name) + dest_dir = os.path.join(base_path, source_name) + if db and db == "checksum": + dest_dir = os.path.join(base_path, "Checksum") + if not os.path.exists(dest_dir): + os.makedirs(dest_dir, exist_ok=True) + + file_path = "" + + # If file is in local ftp, copy from there + if re.search("ftp.ebi.ac.uk", file): + # Construct local path + local_file = file + local_file = re.sub( + "https://ftp.ebi.ac.uk/pub/", "/nfs/ftp/public/", local_file + ) + + # Check if local file exists + if os.path.exists(local_file): + file_path = os.path.join(dest_dir, os.path.basename(uri.path)) + if db and db == "checksum": + file_path = os.path.join( + dest_dir, f"{source_name}-{os.path.basename(uri.path)}" + ) + + if not (skip_download_if_file_present and os.path.exists(file_path)): + shutil.copy(local_file, file_path) + + # Check if copy was successful + if os.path.exists(file_path): + logging.info( + f"{orig_source_name} file copied from local FTP: {file_path}" + ) + # if release: + # return file_path + # return os.path.dirname(file_path) + return file_path + else: + logging.info( + f"{orig_source_name} file already exists, skipping download ({file_path})" + ) + + # Handle Refseq files + if re.search("RefSeq", source_name) and rel_number and catalog and not release: + # Get current release number + release_number = requests.get(rel_number).json() + if not release_number: + raise LookupError(f"No release number in {rel_number}") + + # Get list of files in 
release catalog + catalog = re.sub(r"\*", str(release_number), catalog) + files_list = requests.get(catalog).text + refseq_files = files_list.split("\n") + files_to_download = [] + + # Download each refseq file + for refseq_file in refseq_files: + if not refseq_file: + continue + checksum, filename = refseq_file.split("\t") + + # Only interested in files matching pattern + if not fnmatch.fnmatch(filename, os.path.basename(uri.path)): + continue + if re.search("nonredundant_protein", filename) or re.search( + "wp_protein", filename + ): + continue + + file_path = os.path.join(dest_dir, os.path.basename(filename)) + if os.path.exists(file_path): + if skip_download_if_file_present: + logging.info( + f"{orig_source_name} file already exists, skipping download ({file_path})" + ) + continue + os.remove(file_path) + + file_url = os.path.join(os.path.dirname(file), filename) + files_to_download.append({"url": file_url, "path": file_path}) + logging.info( + f"{orig_source_name} file downloaded via HTTP: {file_path}" + ) + + self.refseq_multithreading(files_to_download) + elif uri.scheme == "ftp": + ftp = FTP(uri.netloc) + ftp.login("anonymous", "-anonymous@") + ftp.cwd(os.path.dirname(uri.path)) + remote_files = ftp.nlst() + + # Download files in ftp server + for remote_file in remote_files: + # Only interested in files matching pattern + if not fnmatch.fnmatch(remote_file, os.path.basename(uri.path)): + continue + + remote_file = re.sub(r"\n", "", remote_file) + file_path = os.path.join(dest_dir, os.path.basename(remote_file)) + if db and db == "checksum": + file_path = os.path.join( + dest_dir, f"{source_name}-{os.path.basename(remote_file)}" + ) + + if not (skip_download_if_file_present and os.path.exists(file_path)): + ftp.retrbinary("RETR " + remote_file, open(file_path, "wb").write) + logging.info( + f"{orig_source_name} file downloaded via FTP: {file_path}" + ) + else: + logging.info( + f"{orig_source_name} file already exists, skipping download ({file_path})" + ) + ftp.close() + elif uri.scheme == "http" or uri.scheme == "https": + # This is the case for the release file + if re.search("RefSeq", source_name) and rel_number and release: + # Get current release number + release_number = requests.get(rel_number).json() + if not release_number: + raise LookupError(f"No release number in {rel_number}") + + file = re.sub(r"\*", str(release_number), file) + uri = urlparse(file) + + file_path = os.path.join(dest_dir, os.path.basename(uri.path)) + if db and db == "checksum": + file_path = os.path.join( + dest_dir, f"{source_name}-{os.path.basename(uri.path)}" + ) + + if not os.path.exists(file_path) or not skip_download_if_file_present: + if not skip_download_if_file_present and os.path.exists(file_path): + os.remove(file_path) + wget.download(file, file_path) + logging.info( + f"{orig_source_name} file downloaded via HTTP: {file_path}" + ) + else: + logging.info( + f"{orig_source_name} file already exists, skipping download ({file_path})" + ) + else: + raise AttributeError(f"Invalid URL scheme {uri.scheme}") + + # if release: + # return file_path + # return os.path.dirname(file_path) + if re.search("RefSeq", source_name) and not release: return os.path.dirname(file_path) + return file_path + + def refseq_multithreading(self, files: List[str]) -> None: + """Creates multiple threads to download RefSeq files in parallel. + + Parameters + ---------- + files: list + The list of file URLs and paths to download. 
+ """ + number_of_threads = 20 + chunk_size = int(len(files) / number_of_threads) + threads = [] + + for thread_index in range(number_of_threads): + array_start = thread_index * chunk_size + array_end = ( + len(files) + if thread_index + 1 == number_of_threads + else (thread_index + 1) * chunk_size + ) + + thread = threading.Thread( + target=self.download_refseq_files, args=(files, array_start, array_end) + ) + threads.append(thread) + threads[thread_index].start() + + for thread in threads: + thread.join() + + def download_refseq_files(self, files: List[str], start: int, end: int) -> None: + """Downloads RefSeq files from a subset of files. + + Parameters + ---------- + files: list + The list of file URLs and paths to download. + start: int + The start index of the files list. + end: int + The end index of the files list. + + Raises + ------ + Exception + If file download fails all attempts. + """ + for index in range(start, end): + failed = 0 + file_url = files[index]["url"] + local_path = files[index]["path"] + + for retry in range(0, 3): + try: + wget.download(file_url, local_path) + except: + failed += 1 + continue + break + + if failed > 0: + raise BufferError(f"Failed to download file {file_url}") + + def get_dbi(self, url: str) -> Connection: + """Returns a DB connection for a provided URL. + + Parameters + ---------- + url: str + The database URL to connect to + + Returns + ------- + An sqlalchemy engine connection. + """ + connect_url = make_url(url) + engine = create_engine(connect_url, isolation_level="AUTOCOMMIT") + + return engine.connect() + + def get_db_engine(self, url: str) -> Engine: + """Returns a DB engine for a provided URL. + + Parameters + ---------- + url: str + The database URL to create an engine for + + Returns + ------- + An sqlalchemy engine. + """ + connect_url = make_url(url) + engine = create_engine(connect_url, isolation_level="AUTOCOMMIT") + + return engine + + def load_checksum(self, path: str, url: str) -> None: + """Loads the xref checksum files into a provided database. + This first combines the checksum data from different xref sources into 1 file called checksum.txt before loading into the DB. 
+ + Parameters + ---------- + path: str + The path where the checksum files can be found + url: str + The database URL to load the checksum data into + """ + checksum_dir = os.path.join(path, "Checksum") + if not os.path.exists(checksum_dir): + os.makedirs(checksum_dir, exist_ok=True) + + output_files = [] + threshold = 50000000 + counter = 1 + source_id = 1 + output_fh = None + + # Connect to db + url = url + "?local_infile=1" + db_engine = self.get_db_engine(url) + with db_engine.connect() as dbi: + # Get all checksum files + files = os.listdir(checksum_dir) + + # Go through all available checksum files + index = 0 + for checksum_file in files: + if re.search("checksum", checksum_file): + continue + + # Get the source name and ID + input_file = os.path.join(checksum_dir, checksum_file) + match = re.search(r"\/([A-Za-z]*)-.*$", input_file) + source_name = match.group(1) + source_id = self.get_source_id_from_name(dbi, source_name) + + # Open the input file + input_fh = self.get_filehandle(input_file) + for line in input_fh: + # Open the output file + if not output_fh or (counter % threshold) == 0: + if output_fh: + output_fh.close() + index += 1 + output_file = os.path.join( + checksum_dir, f"checksum_{index}.txt" + ) + output_files.append(output_file) + output_fh = open(output_file, "w") + + line = line.rstrip() + (checksum_id, checksum) = re.split(r"\s+", line) + + output = [str(counter), str(source_id), checksum_id, checksum] + output_str = "\t".join(output) + output_fh.write(f"{output_str}\n") + + counter += 1 + + input_fh.close() + + if output_fh: + output_fh.close() + + # Add the data in the files to the db + for output_file in output_files: + dbi.execute( + text( + f"load data local infile '{output_file}' into table checksum_xref" + ) + ) + + # Merge the created files + merged_file = os.path.join(checksum_dir, f"checksum.txt") + with open(merged_file, "w") as output_fh: + for output_file in output_files: + with open(output_file, "r") as input_fh: + shutil.copyfileobj(input_fh, output_fh) + os.remove(output_file) + + def get_filehandle(self, filename: str) -> IO: + """Opens an appropriate read filehandle for a file based on its type. + + Parameters + ---------- + filename: str + The name and path of the file to read + + Returns + ------- + A read filehandle. + + Raises + ------ + FileNotFoundError + If no file name was provided. + If provided file could not be found. 
+ """ + if not filename or filename == "": + raise FileNotFoundError("No file name") + + alt_filename = filename + alt_filename = re.sub(r"\.(gz|Z)$", "", alt_filename) + if alt_filename == filename: + alt_filename = alt_filename + ".gz" + + if not os.path.exists(filename): + if not os.path.exists(alt_filename): + raise FileNotFoundError( + f"Could not find either {filename} or {alt_filename}" + ) + filename = alt_filename + + if re.search(r"\.(gz|Z)$", filename): + fh = gzip.open(filename, "rt") else: - logging.info(f'{orig_source_name} file already exists, skipping download ({file_path})') - - # Handle Refseq files - if re.search("RefSeq", source_name) and rel_number and catalog and not release: - # Get current release number - release_number = requests.get(rel_number).json() - if not release_number: - raise LookupError(f'No release number in {rel_number}') - - # Get list of files in release catalog - catalog = re.sub(r"\*", str(release_number), catalog) - files_list = requests.get(catalog).text - refseq_files = files_list.split("\n") - files_to_download = [] - - # Download each refseq file - for refseq_file in refseq_files: - if not refseq_file: continue - checksum, filename = refseq_file.split("\t") - - # Only interested in files matching pattern - if not fnmatch.fnmatch(filename, os.path.basename(uri.path)): continue - if re.search("nonredundant_protein", filename) or re.search("wp_protein", filename): continue - - file_path = os.path.join(dest_dir, os.path.basename(filename)) - if os.path.exists(file_path): - if skip_download_if_file_present: - logging.info(f'{orig_source_name} file already exists, skipping download ({file_path})') - continue - os.remove(file_path) - - file_url = os.path.join(os.path.dirname(file), filename) - files_to_download.append({'url': file_url, 'path': file_path}) - logging.info(f'{orig_source_name} file downloaded via HTTP: {file_path}') - - self.refseq_multithreading(files_to_download) - elif uri.scheme == 'ftp': - ftp = FTP(uri.netloc) - ftp.login('anonymous', '-anonymous@') - ftp.cwd(os.path.dirname(uri.path)) - remote_files = ftp.nlst() - - # Download files in ftp server - for remote_file in remote_files: - # Only interested in files matching pattern - if not fnmatch.fnmatch(remote_file, os.path.basename(uri.path)): continue - - remote_file = re.sub(r"\n", "", remote_file) - file_path = os.path.join(dest_dir, os.path.basename(remote_file)) - if db and db == 'checksum': - file_path = os.path.join(dest_dir, f'{source_name}-{os.path.basename(remote_file)}') - - if not (skip_download_if_file_present and os.path.exists(file_path)): - ftp.retrbinary("RETR " + remote_file , open(file_path, 'wb').write) - logging.info(f'{orig_source_name} file downloaded via FTP: {file_path}') + fh = open(filename, "r") + + return fh + + def get_source_id_from_name(self, dbi: Connection, source_name: str) -> int: + """Retrieves a source ID from its name from a database. + + Parameters + ---------- + dbi: db connection + The database connection to query in + source_name: str + The name of the source + + Returns + ------- + The source ID. + """ + source_id = dbi.execute( + select(SourceSORM.source_id).where(SourceSORM.name == source_name) + ).scalar() + + return source_id + + def get_file_sections(self, file: str, delimiter: str) -> Iterator[List[str]]: + """Reads a provided file by sections, separated by a provided delimiter. + This function uses 'yield' to provide the file sections one by one. 
+ + Parameters + ---------- + file: str + The name and path of the file to read + delimiter: str + The character or string separating the file sections + + Returns + ------- + A yield of file sections. + """ + if re.search(r"\.(gz|Z)$", file): + with gzip.open(file, "rt") as fh: + groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) + for key, group in groups: + yield list(group) else: - logging.info(f'{orig_source_name} file already exists, skipping download ({file_path})') - ftp.close() - elif uri.scheme == 'http' or uri.scheme == 'https': - # This is the case for the release file - if re.search("RefSeq", source_name) and rel_number and release: - # Get current release number - release_number = requests.get(rel_number).json() - if not release_number: - raise LookupError(f'No release number in {rel_number}') - - file = re.sub(r"\*", str(release_number), file) - uri = urlparse(file) - - file_path = os.path.join(dest_dir, os.path.basename(uri.path)) - if db and db == 'checksum': - file_path = os.path.join(dest_dir, f'{source_name}-{os.path.basename(uri.path)}') - - if not os.path.exists(file_path) or not skip_download_if_file_present: - if not skip_download_if_file_present and os.path.exists(file_path): - os.remove(file_path) - wget.download(file, file_path) - logging.info(f'{orig_source_name} file downloaded via HTTP: {file_path}') - else: - logging.info(f'{orig_source_name} file already exists, skipping download ({file_path})') - else: - raise AttributeError(f'Invalid URL scheme {uri.scheme}') - - if release: - return file_path - return os.path.dirname(file_path) - - def refseq_multithreading(self, files): - """ Creates multiple threads to download RefSeq files in parallel. - - Parameters - ---------- - files: list - The list of file URLs and paths to download. - """ - number_of_threads = 20 - chunk_size = int(len(files) / number_of_threads) - threads = [] - - for thread_index in range(number_of_threads): - array_start = thread_index * chunk_size - array_end = len(files) if thread_index+1 == number_of_threads else (thread_index+1) * chunk_size - - thread = threading.Thread(target=self.download_refseq_files, args=(files, array_start, array_end)) - threads.append(thread) - threads[thread_index].start() - - for thread in threads: - thread.join() - - def download_refseq_files(self, files, start: int, end: int): - """ Downloads RefSeq files from a subset of files. - - Parameters - ---------- - files: list - The list of file URLs and paths to download. - start: int - The start index of the files list. - end: int - The end index of the files list. - - Raises - ------ - Exception - If file download fails all attempts. - """ - for index in range(start, end): - failed = 0 - file_url = files[index]['url'] - local_path = files[index]['path'] - - for retry in range(0,3): - try: - wget.download(file_url, local_path) - except: - failed += 1 - continue - break - - if failed > 0: - raise Exception(f'Failed to download file {file_url}') - - def get_dbi(self, url: str): - """ Returns a DB connection for a provided URL. - - Parameters - ---------- - url: str - The database URL to connect to - - Returns - ------- - An sqlalchemy engine connection. - """ - connect_url = make_url(url) - engine = create_engine(connect_url, isolation_level="AUTOCOMMIT") - - return engine.connect() - - def get_db_engine(self, url: str): - """ Returns a DB engine for a provided URL. - - Parameters - ---------- - url: str - The database URL to create an engine for - - Returns - ------- - An sqlalchemy engine. 
- """ - connect_url = make_url(url) - engine = create_engine(connect_url, isolation_level="AUTOCOMMIT") - - return engine - - def load_checksum(self, path: str, url: str): - """ Loads the xref checksum files into a provided database. - This first combines the checksum data from different xref sources into 1 file called checksum.txt before loading into the DB. - - Parameters - ---------- - path: str - The path where the checksum files can be found - url: str - The database URL to load the checksum data into - """ - checksum_dir = os.path.join(path, 'Checksum') - if not os.path.exists(checksum_dir): os.makedirs(checksum_dir, exist_ok = True) - - # Connect to db - url = url + "?local_infile=1" - db_engine = self.get_db_engine(url) - with db_engine.connect() as dbi: - counter = 1 - source_id = 1 - - # Open the checksum output file - files = os.listdir(checksum_dir) - checksum_file = os.path.join(checksum_dir, 'checksum.txt') - with open(checksum_file, 'w') as output_fh: - # Go through all available checksum files - for file in files: - if re.search("checksum", file): continue - - input_file = os.path.join(checksum_dir, file) - match = re.search(r"\/([A-Za-z]*)-.*$", input_file) - source_name = match.group(1) - source_id = self.get_source_id_from_name(dbi, source_name) - - input_fh = self.get_filehandle(input_file) - for line in input_fh: - line = line.rstrip() - (id, checksum) = re.split(r"\s+", line) - - counter += 1 - output = [str(counter), str(source_id), id, checksum] - output_str = "\t".join(output) - output_fh.write(f'{output_str}\n') - - input_fh.close() - - query = f'load data local infile \'{checksum_file}\' into table checksum_xref' - dbi.execute(text(query)) - - def get_filehandle(self, filename: str): - """ Opens an appropriate read filehandle for a file based on its type. - - Parameters - ---------- - filename: str - The name and path of the file to read - - Returns - ------- - A read filehandle. - - Raises - ------ - FileNotFoundError - If no file name was provided. - If provided file could not be found. - """ - if not filename or filename == '': - raise FileNotFoundError('No file name') - - alt_filename = filename - alt_filename = re.sub(r"\.(gz|Z)$", "", alt_filename) - if alt_filename == filename: - alt_filename = alt_filename + ".gz" - - if not os.path.exists(filename): - if not os.path.exists(alt_filename): - raise FileNotFoundError(f'Could not find either {filename} or {alt_filename}') - filename = alt_filename - - if re.search(r"\.(gz|Z)$", filename): - fh = gzip.open(filename, 'rt') - else: - fh = open(filename, 'r') - - return fh - - def get_source_id_from_name(self, dbi, source_name: str): - """ Retrieves a source ID from its name from a database. - - Parameters - ---------- - dbi: db connection - The database connection to query in - source_name: str - The name of the source - - Returns - ------- - The source ID. - """ - query = select(SourceSORM.source_id).where(SourceSORM.name==source_name) - source_id = dbi.execute(query).scalar() - - return source_id - - def get_file_sections(self, file: str, delimiter: str): - """ Reads a provided file by sections, separated by a provided delimiter. - This function uses 'yield' to provide the file sections one by one. - - Parameters - ---------- - file: str - The name and path of the file to read - delimiter: str - The character or string separating the file sections - - Returns - ------- - A yield of file sections. 
- """ - if re.search(r"\.(gz|Z)$", file): - with gzip.open(file, 'rt') as fh: - groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) - for key,group in groups: - yield list(group) - else: - with open(file, 'r') as fh: - groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) - for key,group in groups: - yield list(group) - - def create_xref_db(self, url: str, config_file: str, preparse:bool): - """ Creates the xref database from model. - This function always drops the database defined by the provided URL (if it exists) before creating a new one. - - Parameters - ---------- - url: str - The database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname] - config_file: str - The name and path of the .ini file that has information about xref sources and species - preparse: bool - Specifies whether source preparsing will be done or not - """ - engine = create_engine(url, isolation_level="AUTOCOMMIT") - - # Drop database and create again - if database_exists(engine.url): - drop_database(engine.url) - create_database(engine.url) - XrefUpdateDB.metadata.create_all(engine) - - xref_dbi = engine.connect() - self.populate_xref_db(xref_dbi, config_file, preparse) - - def populate_xref_db(self, dbi, config_file:str, preparse:bool): - """ Populates the xref database with configuration data. - - Parameters - ---------- - dbi: db connection - The xref database connection - config_file: str - The name and path of the .ini file that has information about xref sources and species to populate the database with - preparse: bool - Specifies whether source preparsing will be done or not (needed to decide if to use old parsers) - - Raises - ------ - KeyError - If a source exists in a species section in the configuration file, but has no source section of its own. 
- """ - source_ids = {} - source_parsers = {} - species_sources = {} - - config = ConfigParser() - config.read(config_file) - - species_sections, sources_sections = {}, {} - - for section_name in config.sections(): - section = config[section_name] - (keyword, name) = re.split(r"\s+", section_name) - - if keyword == 'source': - sources_sections[name] = section - elif keyword == 'species': - species_sections[name] = section - - # Parse species sections - for species_name, section in species_sections.items(): - taxonomy_ids = section.get('taxonomy_id').split(",") - sources = section.get('sources') - aliases = section.get('aliases', species_name) - - species_id = taxonomy_ids[0] - - for tax_id in taxonomy_ids: - # Add new species - query = insert(SpeciesORM).values(species_id=species_id, taxonomy_id=tax_id, name=species_name, aliases=aliases) - dbi.execute(query) - - species_sources[species_id] = sources - - source_id = 0 - # Parse source sections - for source_name, section in sorted(sources_sections.items()): - source_id += 1 - source_name = section.get('name') - order = section.get('order') - priority = section.get('priority') - priority_description = section.get('prio_descr', '') - status = section.get('status', 'NOIDEA') - - old_parser = section.get('old_parser') - if old_parser and not preparse: - parser = old_parser - else: - parser = section.get('parser') - - # Add new source - query = insert(SourceUORM).values(name=source_name, source_release='1', ordered=order, priority=priority, priority_description=priority_description, status=status) - dbi.execute(query) - - source_ids[source_name] = source_id - source_parsers[source_id] = parser - - # Add source url rows - for species_id, sources in species_sources.items(): - source_names = sources.split(",") - - for source_name in source_names: - if not source_ids.get(source_name): - raise KeyError(f'No source section found for {source_name} in config file') - - source_id = source_ids[source_name] - parser = source_parsers[source_id] - query = insert(SourceURLORM).values(source_id=source_id, species_id=species_id, parser=parser) - dbi.execute(query) - - def get_source_id(self, dbi, parser: str, species_id: int, name: str, division_id: int): - """ Retrieves a source ID from its parser, species ID, name or division ID. - - Parameters - ---------- - dbi: db connection - The database connection to query in - parser: str - The source parser - species_id: int - The ID of the species related to the source - name: str - The source name - division_id: int - The ID of the division related to the source - - Returns - ------- - The source ID. 
- """ - name = "%"+name+"%" - source_id = None - - query = select(SourceURLORM.source_id).where(SourceUORM.source_id==SourceURLORM.source_id, SourceURLORM.parser==parser, SourceURLORM.species_id==species_id) - result = dbi.execute(query) - if result.rowcount == 1: - source_id = result.scalar() - - query = select(SourceURLORM.source_id).where(SourceUORM.source_id==SourceURLORM.source_id, SourceURLORM.parser==parser, SourceURLORM.species_id==species_id).filter(SourceUORM.name.like(name)) - result = dbi.execute(query) - if result.rowcount == 1: - source_id = result.scalar() - - if not source_id: - query = select(SourceURLORM.source_id).where(SourceUORM.source_id==SourceURLORM.source_id, SourceURLORM.parser==parser, SourceURLORM.species_id==division_id).filter(SourceUORM.name.like(name)) - result = dbi.execute(query).first() - if result: - source_id = result[0] - - return source_id - - def get_taxon_id(self, dbi): - """ Retrieves the species.taxonomy_id value of the meta table in a database. - - Parameters - ---------- - dbi: db connection - The database connection to query in - - Returns - ------- - The taxonomy ID in the database or 1 if not found. - """ - query = select(MetaCORM.meta_value).where(MetaCORM.meta_key=='species.taxonomy_id') - result = dbi.execute(query) - if result.rowcount > 0: - return result.scalar() - - return 1 - - def get_division_id(self, dbi): - """ Retrives the division ID from a database based on the species.division value of the meta table. - - Parameters - ---------- - dbi: db connection - The database connection to query in - - Returns - ------- - The division ID in the database or 1 if not found - """ - query = select(MetaCORM.meta_value).where(MetaCORM.meta_key=='species.division') - result = dbi.execute(query) - - if result.rowcount > 0: - division = result.scalar() - - division_taxon = { - 'Ensembl' : 7742, - 'EnsemblVertebrates' : 7742, - 'Vertebrates' : 7742, - 'EnsemblMetazoa' : 33208, - 'Metazoa' : 33208, - 'Plants' : 33090, - 'EnsemblPlants' : 33090, - } - - division_id = division_taxon.get(division) - if division_id: - return division_id - - return 1 - - def get_path(self, base_path: str, species: str, release: int, category: str, file_name: str=None): - """ Creates directories based on provided data. - - Parameters - ---------- - base_path: str - The base file path - species: str - The species name - release: int - The ensEMBL release number - category: str - The file category - file_name: str, optional - The file name - - Returns - ------- - A file path. - """ - full_path = os.path.join(base_path, species, release, category) - if not os.path.exists(full_path): - os.makedirs(full_path, exist_ok = True) - - if file_name: - return os.path.join(full_path, file_name) - else: - return full_path - - def get_db_from_registry(self, species: str, group: str, release: int, registry: str): - """ Looks up a db in the registry and returns an sqlaclehmy angine for it. - - Parameters - ---------- - species: str - The species name - group: str - The db group (core, ccds, otherfeatures, etc...) - release: int - The ensEMBL release number - registry: str - The registry url - - Returns - ------- - A db engine or 0 if no db is found. 
- """ - # Fix registry url, if needed - match = re.search(r"^(.*)://(.*)", registry) - if match: registry = match.group(2) - match = re.search(r"(.*)/(.*)", registry) - if match: registry = match.group(1) - - metasearch_url = self.param_required('metasearch_url') - metasearch_body = { - "name_pattern":f'{species}_{group}%', - "filters":[ - { - "meta_key":"schema_version", - "meta_value":release - }, - ], - "servers":[registry] - } - - dbs = requests.post(metasearch_url, json=metasearch_body).json() - dbs = dbs[registry] - - if len(dbs) > 0: - db_url = 'mysql://' + dbs[0] - return db_url - else: - return 0 - - # def get_spark_session(self, data_type): - # if data_type == 'mysql': - # spark = SparkSession.builder.appName('SparkByExamples.com').config("spark.jars", "mysql-connector-java-8.0.13.jar").getOrCreate() - # return spark - # else: - # raise Exception(f'Spark data type {data_type} not supported yet') - - # def get_spark_reader(self, spark_session, data_type, data_url): - # if data_type == 'mysql': - # reader = spark_session.read.format("jdbc").option("driver", "com.mysql.cj.jdbc.Driver").option("url", f'jdbc:{data_url}') - # return reader - # else: - # raise Exception(f'Spark data type {data_type} not supported yet') - - def get_xref_mapper(self, xref_url: str, species: str, base_path: str, release: int, core_url: str=None, registry: str=None): - """ Retrives a mapper object based on species. - - Parameters - ---------- - xref_url: str - The xref db connection url - species: str - The species name - base_path: str - The base file path - release: int - The ensEMBL release number - core_db: str, optional - The species core db connection url - registry: str, optional - The registry url - - Returns - ------- - A mapper object - """ - # Need either core_db or registry - if not core_url and not registry: - raise AttributeError(f'Method get_xref_mapper: need to provide either a core DB URL or a registry URL') - - # Create needed db connections - if not core_url: - core_url = self.get_db_from_registry(species, 'core', release, registry) - - core_db = self.get_db_engine(core_url) - xref_db = self.get_db_engine(xref_url) - - # Extract host and dbname from xref url - xref_url_obj = make_url(xref_url) - host = xref_url_obj.host - dbname = xref_url_obj.database - - # Locate the fasta files - cdna_path = self.get_path(base_path, species, release, 'ensembl', 'transcripts.fa'); - pep_path = self.get_path(base_path, species, release, 'ensembl', 'peptides.fa'); - - # Try to find a species-specific mapper first - module_name = f'ensembl.xrefs.mappers.{species}' - class_name = species - found = importlib.find_loader(module_name) - if not found: - module_name = 'ensembl.xrefs.mappers.BasicMapper' - class_name = 'BasicMapper' - - # Create a mapper object - module = importlib.import_module(module_name) - module_class = getattr(module, class_name) - mapper = module_class() - - mapper.xref(xref_db) - mapper.add_meta_pair('xref', f'{host}:{dbname}') - mapper.core(core_db) - mapper.add_meta_pair('species', f'{host}:{dbname}') - mapper.dna_file(cdna_path) - mapper.protein_file(pep_path) - mapper.log_file(self._log_file) - - return mapper - - + with open(file, "r") as fh: + groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) + for key, group in groups: + yield list(group) + + def create_xref_db(self, url: str, config_file: str) -> None: + """Creates the xref database from model. + This function always drops the database defined by the provided URL (if it exists) before creating a new one. 
+ + Parameters + ---------- + url: str + The database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname] + config_file: str + The name and path of the .ini file that has information about xref sources and species + """ + engine = create_engine(url, isolation_level="AUTOCOMMIT") + + # Drop database and create again + if database_exists(engine.url): + drop_database(engine.url) + create_database(engine.url) + XrefUpdateDB.metadata.create_all(engine) + + xref_dbi = engine.connect() + self.populate_xref_db(xref_dbi, config_file) + + def populate_xref_db(self, dbi: Connection, config_file: str) -> None: + """Populates the xref database with configuration data. + + Parameters + ---------- + dbi: db connection + The xref database connection + config_file: str + The name and path of the .ini file that has information about xref sources and species to populate the database with + + Raises + ------ + KeyError + If a source exists in a species section in the configuration file, but has no source section of its own. + """ + source_ids, source_parsers, species_sources = {}, {}, {} + species_sections, sources_sections = {}, {} + + config = ConfigParser() + config.read(config_file) + + for section_name in config.sections(): + section = config[section_name] + (keyword, name) = re.split(r"\s+", section_name) + + if keyword == "source": + sources_sections[name] = section + elif keyword == "species": + species_sections[name] = section + + # Parse species sections + for species_name, section in species_sections.items(): + taxonomy_ids = section.get("taxonomy_id").split(",") + sources = section.get("sources") + aliases = section.get("aliases", species_name) + + species_id = taxonomy_ids[0] + + for tax_id in taxonomy_ids: + # Add new species + dbi.execute( + insert(SpeciesORM).values( + species_id=species_id, + taxonomy_id=tax_id, + name=species_name, + aliases=aliases, + ) + ) + + species_sources[species_id] = sources + + source_id = 0 + # Parse source sections + for source_name, section in sorted(sources_sections.items()): + source_id += 1 + source_db_name = section.get("name") + order = section.get("order") + priority = section.get("priority") + priority_description = section.get("prio_descr", "") + status = section.get("status", "NOIDEA") + parser = section.get("parser") + + # Add new source + dbi.execute( + insert(SourceUORM).values( + name=source_db_name, + source_release="1", + ordered=order, + priority=priority, + priority_description=priority_description, + status=status, + ) + ) + + source_ids[source_name] = source_id + source_parsers[source_id] = parser + + # Add source url rows + for species_id, sources in species_sources.items(): + source_names = sources.split(",") + + for source_name in source_names: + if not source_ids.get(source_name): + raise KeyError( + f"No source section found for {source_name} in config file" + ) + + source_id = source_ids[source_name] + parser = source_parsers[source_id] + dbi.execute( + insert(SourceURLORM).values( + source_id=source_id, species_id=species_id, parser=parser + ) + ) + + def get_source_id(self, dbi: Connection, parser: str, species_id: int, name: str, division_id: int) -> Optional[int]: + """Retrieves a source ID from its parser, species ID, name or division ID. 
+ + Parameters + ---------- + dbi: db connection + The database connection to query in + parser: str + The source parser + species_id: int + The ID of the species related to the source + name: str + The source name + division_id: int + The ID of the division related to the source + + Returns + ------- + The source ID. + """ + name = "%" + name + "%" + source_id = None + + query = select(SourceURLORM.source_id).where( + SourceUORM.source_id == SourceURLORM.source_id, + SourceURLORM.parser == parser, + SourceURLORM.species_id == species_id, + ) + result = dbi.execute(query) + if result.rowcount == 1: + source_id = result.scalar() + + query = ( + select(SourceURLORM.source_id) + .where( + SourceUORM.source_id == SourceURLORM.source_id, + SourceURLORM.parser == parser, + SourceURLORM.species_id == species_id, + ) + .filter(SourceUORM.name.like(name)) + ) + result = dbi.execute(query) + if result.rowcount == 1: + source_id = result.scalar() + + if not source_id: + query = ( + select(SourceURLORM.source_id) + .where( + SourceUORM.source_id == SourceURLORM.source_id, + SourceURLORM.parser == parser, + SourceURLORM.species_id == division_id, + ) + .filter(SourceUORM.name.like(name)) + ) + result = dbi.execute(query).first() + if result: + source_id = result[0] + + return source_id + + def get_taxon_id(self, dbi: Connection) -> int: + """Retrieves the species.taxonomy_id value of the meta table in a database. + + Parameters + ---------- + dbi: db connection + The database connection to query in + + Returns + ------- + The taxonomy ID in the database or 1 if not found. + """ + result = dbi.execute( + select(MetaCORM.meta_value).where( + MetaCORM.meta_key == "species.taxonomy_id" + ) + ) + if result.rowcount > 0: + return int(result.scalar()) + + return 1 + + def get_division_id(self, dbi: Connection) -> int: + """Retrives the division ID from a database based on the species.division value of the meta table. + + Parameters + ---------- + dbi: db connection + The database connection to query in + + Returns + ------- + The division ID in the database or 1 if not found + """ + result = dbi.execute( + select(MetaCORM.meta_value).where(MetaCORM.meta_key == "species.division") + ) + + if result.rowcount > 0: + division = result.scalar() + + division_taxon = { + "Ensembl": 7742, + "EnsemblVertebrates": 7742, + "Vertebrates": 7742, + "EnsemblMetazoa": 33208, + "Metazoa": 33208, + "Plants": 33090, + "EnsemblPlants": 33090, + } + + division_id = division_taxon.get(division) + if division_id: + return int(division_id) + + return 1 + + def get_path(self, base_path: str, species: str, release: int, category: str, file_name: str = None) -> str: + """Creates directories based on provided data. + + Parameters + ---------- + base_path: str + The base file path + species: str + The species name + release: int + The ensEMBL release number + category: str + The file category + file_name: str, optional + The file name + + Returns + ------- + A file path. + """ + full_path = os.path.join(base_path, species, release, category) + if not os.path.exists(full_path): + os.makedirs(full_path, exist_ok=True) + + if file_name: + return os.path.join(full_path, file_name) + else: + return full_path + + def get_db_from_registry(self, species: str, group: str, release: int, registry: str) -> Optional[str]: + """Looks up a db in the registry and returns an sqlaclehmy angine for it. + + Parameters + ---------- + species: str + The species name + group: str + The db group (core, ccds, otherfeatures, etc...) 
+ release: int + The ensEMBL release number + registry: str + The registry url + + Returns + ------- + A db engine or 0 if no db is found. + """ + # Fix registry url, if needed + match = re.search(r"^(.*)://(.*)", registry) + if match: + registry = match.group(2) + match = re.search(r"(.*)/(.*)", registry) + if match: + registry = match.group(1) + + metasearch_url = self.param_required("metasearch_url") + metasearch_body = { + "name_pattern": f"{species}_{group}%", + "filters": [ + {"meta_key": "schema_version", "meta_value": str(release)}, + ], + "servers": [registry], + } + + dbs = requests.post(metasearch_url, json=metasearch_body).json() + dbs = dbs[registry] + + if len(dbs) > 0: + db_url = "mysql://" + dbs[0] + return db_url + else: + return None + + def get_xref_mapper(self, xref_url: str, species: str, base_path: str, release: int, core_url: str = None, registry: str = None) -> BasicMapper: + """Retrives a mapper object based on species. + + Parameters + ---------- + xref_url: str + The xref db connection url + species: str + The species name + base_path: str + The base file path + release: int + The ensEMBL release number + core_db: str, optional + The species core db connection url + registry: str, optional + The registry url + + Returns + ------- + A mapper object + """ + # Need either core_db or registry + if not core_url and not registry: + raise AttributeError( + f"Method get_xref_mapper: need to provide either a core DB URL or a registry URL" + ) + + # Create needed db connections + if not core_url: + core_url = self.get_db_from_registry(species, "core", release, registry) + + core_db = self.get_db_engine(core_url) + xref_db = self.get_db_engine(xref_url) + + # Extract host and dbname from xref url + xref_url_obj = make_url(xref_url) + host = xref_url_obj.host + dbname = xref_url_obj.database + + # Locate the fasta files + cdna_path = self.get_path( + base_path, species, release, "ensembl", "transcripts.fa" + ) + pep_path = self.get_path(base_path, species, release, "ensembl", "peptides.fa") + + # Try to find a species-specific mapper first + module_name = f"ensembl.production.xrefs.mappers.species.{species}" + class_name = species + found = importlib.util.find_spec(module_name) + if not found: + module_name = "ensembl.production.xrefs.mappers.BasicMapper" + class_name = "BasicMapper" + + # Create a mapper object + module = importlib.import_module(module_name) + module_class = getattr(module, class_name) + mapper = module_class() + + mapper.xref(xref_db) + mapper.add_meta_pair("xref", f"{host}:{dbname}") + mapper.core(core_db) + mapper.add_meta_pair("species", f"{host}:{dbname}") + mapper.dna_file(cdna_path) + mapper.protein_file(pep_path) + mapper.log_file(self._log_file) + mapper.species_dir(os.path.join(base_path, species)) + + return mapper diff --git a/src/python/ensembl/production/xrefs/Checksum.py b/src/python/ensembl/production/xrefs/Checksum.py index 7ccb401a7..7edf452e0 100644 --- a/src/python/ensembl/production/xrefs/Checksum.py +++ b/src/python/ensembl/production/xrefs/Checksum.py @@ -16,31 +16,31 @@ from ensembl.production.xrefs.Base import * -class Checksum(Base): - def run(self): - base_path = self.param_required('base_path') - source_db_url = self.param_required('source_db_url') - skip_download = self.param_required('skip_download', {'type': 'bool'}) - - logging.info('Checksum starting with parameters:') - logging.info(f'Param: base_path = {base_path}') - logging.info(f'Param: source_db_url = {source_db_url}') - logging.info(f'Param: skip_download = 
{skip_download}') - - # Connect to source db - db_engine = self.get_db_engine(source_db_url) - - # Check if checksums already exist - table_nonempty = 0 - if skip_download: - with db_engine.connect() as dbi: - query = select(func.count(ChecksumXrefSORM.checksum_xref_id)) - table_nonempty = dbi.execute(query).scalar() - - # Load checksums from files into db - if not table_nonempty: - self.load_checksum(base_path, source_db_url) - logging.info('Checksum data loaded') - else: - logging.info('Checksum data already exists, skipping loading') +class Checksum(Base): + def run(self): + base_path = self.param_required("base_path", {"type": "str"}) + source_db_url = self.param_required("source_db_url", {"type": "str"}) + skip_download = self.param_required("skip_download", {"type": "bool"}) + + logging.info("Checksum starting with parameters:") + logging.info(f"Param: base_path = {base_path}") + logging.info(f"Param: source_db_url = {source_db_url}") + logging.info(f"Param: skip_download = {skip_download}") + + # Connect to source db + db_engine = self.get_db_engine(source_db_url) + + # Check if checksums already exist + table_nonempty = 0 + if skip_download: + with db_engine.connect() as dbi: + query = select(func.count(ChecksumXrefSORM.checksum_xref_id)) + table_nonempty = dbi.execute(query).scalar() + + # Load checksums from files into db + if not table_nonempty: + self.load_checksum(base_path, source_db_url) + logging.info("Checksum data loaded") + else: + logging.info("Checksum data already exists, skipping loading") diff --git a/src/python/ensembl/production/xrefs/CoordinateMapping.py b/src/python/ensembl/production/xrefs/CoordinateMapping.py new file mode 100644 index 000000000..d687ebee1 --- /dev/null +++ b/src/python/ensembl/production/xrefs/CoordinateMapping.py @@ -0,0 +1,50 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
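For orientation, a minimal sketch of driving the Checksum step above on its own, assuming placeholder paths and a placeholder source-database URL; in the pipeline itself these parameters arrive via the workflow dataflow rather than being hard-coded.

from ensembl.production.xrefs.Checksum import Checksum

# All values below are placeholders; base_path is expected to hold a Checksum/
# directory containing the per-source checksum files to load.
Checksum({
    "base_path": "/path/to/xref/run",
    "source_db_url": "mysql://user:password@host:3306/xref_source_db",
    "skip_download": False,
}).run()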
+ +"""Xref module to process the coordinate mappings.""" + +from ensembl.production.xrefs.Base import * +from ensembl.production.xrefs.mappers.CoordinateMapper import CoordinateMapper + + +class CoordinateMapping(Base): + def run(self): + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + scripts_dir = self.param_required("perl_scripts_dir", {"type": "str"}) + registry = self.param("registry_url", None, {"type": "str"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + + logging.info(f"CoordinateMapping starting for species '{species_name}'") + + if not core_db_url: + core_db_url = self.get_db_from_registry( + species_name, "core", release, registry + ) + + # Get species id + db_engine = self.get_db_engine(core_db_url) + with db_engine.connect() as core_dbi: + species_id = self.get_taxon_id(core_dbi) + + # Get the appropriate mapper + mapper = self.get_xref_mapper( + xref_db_url, species_name, base_path, release, core_db_url, registry + ) + + # Process the coordinate xrefs + coord = CoordinateMapper(mapper) + coord.run_coordinatemapping(species_name, species_id, scripts_dir) diff --git a/src/python/ensembl/production/xrefs/DirectXrefs.py b/src/python/ensembl/production/xrefs/DirectXrefs.py new file mode 100644 index 000000000..f6522b274 --- /dev/null +++ b/src/python/ensembl/production/xrefs/DirectXrefs.py @@ -0,0 +1,39 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Xref module to process direct xrefs.""" + +from ensembl.production.xrefs.Base import * +from ensembl.production.xrefs.mappers.DirectXrefsMapper import DirectXrefsMapper + + +class DirectXrefs(Base): + def run(self): + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + registry = self.param("registry_url", None, {"type": "str"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + + logging.info(f"DirectXrefs starting for species '{species_name}'") + + # Get the appropriate mapper + mapper = self.get_xref_mapper( + xref_db_url, species_name, base_path, release, core_db_url, registry + ) + + # Process the direct xrefs + direct_mappings = DirectXrefsMapper(mapper) + direct_mappings.process() diff --git a/src/python/ensembl/production/xrefs/DownloadSource.py b/src/python/ensembl/production/xrefs/DownloadSource.py index 060fcb116..f3b9f20f4 100644 --- a/src/python/ensembl/production/xrefs/DownloadSource.py +++ b/src/python/ensembl/production/xrefs/DownloadSource.py @@ -16,48 +16,58 @@ from ensembl.production.xrefs.Base import * + class DownloadSource(Base): - def run(self): - base_path = self.param_required('base_path') - parser = self.param_required('parser') - name = self.param_required('name') - priority = self.param_required('priority') - source_db_url = self.param_required('source_db_url') - file = self.param_required('file') - skip_download = self.param_required('skip_download', {'type': 'bool'}) - db = self.param('db') - version_file = self.param('version_file') - preparse = self.param('preparse', None, {'type': 'bool'}) - rel_number = self.param('rel_number') - catalog = self.param('catalog') - - logging.info(f'DownloadSource starting for source {name}') - - # Download the main xref file - extra_args = {} - extra_args['skip_download_if_file_present'] = skip_download - extra_args['db'] = db - if rel_number and catalog: - extra_args['rel_number'] = rel_number - extra_args['catalog'] = catalog - file_name = self.download_file(file, base_path, name, extra_args) - - # Download the version file - version = "" - if version_file: - extra_args['release'] = 'version' - version = self.download_file(version_file, base_path, name, extra_args) - - # Update source db - db_engine = self.get_db_engine(source_db_url) - with db_engine.connect() as dbi: - query = insert(SourceSORM).values(name=name, parser=parser).prefix_with('IGNORE') - dbi.execute(query) - - query = select(SourceSORM.source_id).where(SourceSORM.name==name) - source_id = dbi.execute(query).scalar() - - if preparse is None: preparse = False - query = insert(VersionORM).values(source_id=source_id, uri=file_name, index_uri=db, count_seen=priority, revision=version, preparse=preparse).prefix_with('IGNORE') - dbi.execute(query) + def run(self): + base_path = self.param_required("base_path", {"type": "str"}) + parser = self.param_required("parser", {"type": "str"}) + name = self.param_required("name", {"type": "str"}) + priority = self.param_required("priority", {"type": "int"}) + source_db_url = self.param_required("source_db_url", {"type": "str"}) + file = self.param_required("file", {"type": "str"}) + skip_download = self.param_required("skip_download", {"type": "bool"}) + db = self.param("db", None, {"type": "str"}) + version_file = self.param("version_file", None, {"type": "str"}) + rel_number = 
self.param("rel_number", None, {"type": "str"}) + catalog = self.param("catalog", None, {"type": "str"}) + + logging.info(f"DownloadSource starting for source {name}") + + # Download the main xref file + extra_args = {} + extra_args["skip_download_if_file_present"] = skip_download + extra_args["db"] = db + if rel_number and catalog: + extra_args["rel_number"] = rel_number + extra_args["catalog"] = catalog + file_name = self.download_file(file, base_path, name, extra_args) + + # Download the version file + version = "" + if version_file: + extra_args["release"] = "version" + version = self.download_file(version_file, base_path, name, extra_args) + + # Update source db + db_engine = self.get_db_engine(source_db_url) + with db_engine.connect() as dbi: + dbi.execute( + insert(SourceSORM) + .values(name=name, parser=parser) + .prefix_with("IGNORE") + ) + source_id = dbi.execute( + select(SourceSORM.source_id).where(SourceSORM.name == name) + ).scalar() + dbi.execute( + insert(VersionORM) + .values( + source_id=source_id, + file_path=file_name, + db=db, + priority=priority, + revision=version, + ) + .prefix_with("IGNORE") + ) diff --git a/src/python/ensembl/production/xrefs/DumpEnsembl.py b/src/python/ensembl/production/xrefs/DumpEnsembl.py new file mode 100644 index 000000000..84ce39b47 --- /dev/null +++ b/src/python/ensembl/production/xrefs/DumpEnsembl.py @@ -0,0 +1,81 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Dumping module to dump sequence data from a core db.""" + +from ensembl.production.xrefs.Base import * + + +class DumpEnsembl(Base): + def run(self): + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + core_db_url = self.param_required("species_db", {"type": "str"}) + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + retry = self.param("retry", None, {"type": "bool", "default": False}) + + logging.info(f"DumpEnsembl starting for species '{species_name}'") + + # Create files paths + cdna_path = self.get_path( + base_path, species_name, release, "ensembl", "transcripts.fa" + ) + pep_path = self.get_path( + base_path, species_name, release, "ensembl", "peptides.fa" + ) + + # Check if dumping has been done for this run before, to speed up development by not having to re-dump sequences + if ( + not retry + and os.path.exists(cdna_path) + and os.path.getsize(cdna_path) > 0 + and os.path.exists(pep_path) + and os.path.getsize(pep_path) > 0 + ): + logging.info( + f"Dna and peptide data already dumped for species '{species_name}', skipping." 
+ ) + else: + scripts_dir = self.param_required("perl_scripts_dir") + + logging.info(f"Running perl script {scripts_dir}/dump_ensembl.pl") + perl_cmd = f"perl {scripts_dir}/dump_ensembl.pl --cdna_path '{cdna_path}' --pep_path '{pep_path}' --species {species_name} --core_db_url '{core_db_url}' --release {release}" + cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE) + + # Create jobs for peptide dumping and alignment + dataflow_params = { + "species_name": species_name, + "file_path": pep_path, + "xref_db_url": xref_db_url, + "seq_type": "peptide", + } + self.write_output("dump_xref", dataflow_params) + + # Create jobs for cdna dumping and alignment + dataflow_params = { + "species_name": species_name, + "file_path": cdna_path, + "xref_db_url": xref_db_url, + "seq_type": "dna", + } + self.write_output("dump_xref", dataflow_params) + + # Create job for schedule mapping + dataflow_params = { + "species_name": species_name, + "xref_db_url": xref_db_url, + "species_db": core_db_url, + } + self.write_output("schedule_mapping", dataflow_params) diff --git a/src/python/ensembl/production/xrefs/DumpXref.py b/src/python/ensembl/production/xrefs/DumpXref.py new file mode 100644 index 000000000..268c8cae2 --- /dev/null +++ b/src/python/ensembl/production/xrefs/DumpXref.py @@ -0,0 +1,135 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
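A hedged example of the Base.get_file_sections generator defined earlier: because it is built on itertools.groupby, it yields alternating runs of lines, those starting with the delimiter and those in between. The file name and delimiter here are illustrative (flat files such as RefSeq GPFF separate records with lines starting with "//").

import logging

from ensembl.production.xrefs.Base import Base

class ExampleRecordReader(Base):
    def run(self):
        # Illustrative path and delimiter; any module inheriting Base can call this helper.
        for section in self.get_file_sections("/path/to/records.dat.gz", "//"):
            if section[0].lstrip().startswith("//"):
                continue  # a run of separator lines, nothing to parse
            record = "".join(section)
            logging.info(f"Read a record spanning {len(section)} lines")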
+ +"""Dumping module to dump xref sequence data from an xref intermediate db.""" + +from ensembl.production.xrefs.Base import * + +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + + +class DumpXref(Base): + def run(self): + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + file_path = self.param_required("file_path", {"type": "str"}) + seq_type = self.param_required("seq_type", {"type": "str"}) + config_file = self.param_required("config_file", {"type": "str"}) + + logging.info( + f"DumpXref starting for species '{species_name}' with file_path '{file_path}' and seq_type '{seq_type}'" + ) + + # Connect to xref db + xref_dbi = self.get_dbi(xref_db_url) + + # Create output path + full_path = self.get_path(base_path, species_name, release, "xref") + + # Extract sources to download from config file + sources = [] + with open(config_file) as conf_file: + sources = json.load(conf_file) + + # Create hash of available alignment methods + method = {} + query_cutoff = {} + target_cutoff = {} + for source in sources: + if source.get("method"): + method[source["name"]] = source["method"] + query_cutoff[source["name"]] = source.get("query_cutoff") + target_cutoff[source["name"]] = source.get("target_cutoff") + + job_index = 1 + + # Get sources related to sequence type + source_query = select(SourceUORM.name.distinct(), SourceUORM.source_id).where( + SourceUORM.source_id == XrefUORM.source_id, + XrefUORM.xref_id == PrimaryXrefORM.xref_id, + PrimaryXrefORM.sequence_type == seq_type, + ) + for source in xref_dbi.execute(source_query).mappings().all(): + source_name = source.name + source_id = source.source_id + + if re.search(r"RefSeq_.*RNA", source_name): + source_name = "RefSeq_dna" + if re.search("RefSeq_peptide", source_name): + source_name = "RefSeq_peptide" + + if method.get(source_name): + method_name = method[source_name] + source_query_cutoff = query_cutoff[source_name] + source_target_cutoff = target_cutoff[source_name] + + # Open fasta file + file_source_name = source.name + file_source_name = re.sub(r"\/", "", file_source_name) + filename = os.path.join( + full_path, f"{seq_type}_{file_source_name}_{source_id}.fasta" + ) + fasta_fh = open(filename, "w") + + # Get xref sequences + sequence_query = select( + PrimaryXrefORM.xref_id, PrimaryXrefORM.sequence + ).where( + XrefUORM.xref_id == PrimaryXrefORM.xref_id, + PrimaryXrefORM.sequence_type == seq_type, + XrefUORM.source_id == source_id, + ) + for sequence in xref_dbi.execute(sequence_query).mappings().all(): + # Ambiguous peptides must be cleaned out to protect Exonerate from J,O and U codes + seq = sequence.sequence.upper() + if seq_type == "peptide": + seq = re.sub(r"(J|O|U)", "X", seq) + + # Print sequence + SeqIO.write( + SeqRecord(Seq(seq), id=str(sequence.xref_id), description=""), + fasta_fh, + "fasta", + ) + + fasta_fh.close() + + # Pass data into alignment jobs + self.write_output( + "schedule_alignment", + { + "species_name": species_name, + "ensembl_fasta": file_path, + "seq_type": seq_type, + "xref_db_url": xref_db_url, + "method": method_name, + "query_cutoff": source_query_cutoff, + "target_cutoff": source_target_cutoff, + "job_index": job_index, + "source_id": source_id, + "source_name": source_name, + "xref_fasta": filename, + }, + ) + job_index += 1 + + xref_dbi.close() + + if 
job_index == 1: + with open("dataflow_schedule_alignment.json", "a") as fh: + fh.write("") diff --git a/src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py b/src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py new file mode 100644 index 000000000..3513c7afc --- /dev/null +++ b/src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py @@ -0,0 +1,100 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Email module to send user emails notifying of advisory DC failures.""" + +from ensembl.production.xrefs.Base import * + +from smtplib import SMTP +from email.message import EmailMessage + + +class EmailAdvisoryXrefReport(Base): + def run(self): + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + pipeline_name = self.param_required("pipeline_name", {"type": "str"}) + email_address = self.param_required("email", {"type": "str"}) + email_server = self.param_required("email_server", {"type": "str"}) + log_timestamp = self.param("log_timestamp", None, {"type": "str"}) + + # Get the path and name of main reports file + formatted_name = re.sub(r"\s", "_", pipeline_name) + main_report_file_name = f"dc_report_{formatted_name}" + if log_timestamp: + log_path = os.path.join(base_path, "logs", log_timestamp) + main_report_file_name = f"{main_report_file_name}_{log_timestamp}.log" + else: + log_path = os.path.join(base_path, "logs") + if not os.path.exists(log_path): + os.makedir(log_path) + main_report_file_name = f"{main_report_file_name}.log" + + main_report_file = os.path.join(log_path, main_report_file_name) + main_fh = open(main_report_file, "a") + + species_with_reports = {} + + # Get species in base path + species_list = os.listdir(base_path) + + for species in species_list: + # Check if reports exist + dc_path = os.path.join(base_path, species, release, "dc_report") + if os.path.exists(dc_path): + # Get report files + dc_files = os.listdir(dc_path) + + # Add each dc report into main report file + for dc_file in dc_files: + with open(os.path.join(dc_path, dc_file), "r") as file: + dc_data = file.read() + + main_fh.write(f"{dc_data}\n") + + dc_name = dc_file.replace(".log", "") + if species_with_reports.get(dc_name): + species_with_reports[dc_name].append(species) + else: + species_with_reports[dc_name] = [species] + + # TO DO: maybe delete individual reports + + main_fh.close() + + email_message = f"Some advisory datachecks have failed for the following species in the xref pipeline run ({pipeline_name}).

" + for dc_name, species_list in species_with_reports.items(): + email_message += f"Datacheck {dc_name}:
" + email_message += "
    " + for species_name in species_list: + email_message += f"
  • {species_name}
  • " + email_message += "
" + + email_message += "
DC failures details attached in this email." + + # Send email + message = EmailMessage() + message["Subject"] = f"Advisory DC Report (release {release})" + message["From"] = email_address + message["To"] = email_address + message.set_content(email_message, "html") + + with open(main_report_file, "rb") as fh: + file_data = fh.read() + message.add_attachment( + file_data, maintype="text", subtype="plain", filename=main_report_file_name + ) + + smtp = SMTP(email_server) + smtp.send_message(message) diff --git a/src/python/ensembl/production/xrefs/EmailNotification.py b/src/python/ensembl/production/xrefs/EmailNotification.py index 22738d990..4295041a0 100644 --- a/src/python/ensembl/production/xrefs/EmailNotification.py +++ b/src/python/ensembl/production/xrefs/EmailNotification.py @@ -19,118 +19,288 @@ from smtplib import SMTP from email.message import EmailMessage -class EmailNotification(Base): - def run(self): - pipeline_name = self.param_required('pipeline_name') - base_path = self.param_required('base_path') - email_address = self.param_required('email') - email_server = self.param_required('email_server') - log_timestamp = self.param('log_timestamp') - - email_message = f'The {pipeline_name} has completed its run.
' - - if log_timestamp: - # Get the path of the log files - log_path = os.path.join(base_path, 'logs', log_timestamp) - # Read the log file - if os.path.exists(log_path): +class EmailNotification(Base): + def run(self): + pipeline_name = self.param_required("pipeline_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + email_address = self.param_required("email", {"type": "str"}) + email_server = self.param_required("email_server", {"type": "str"}) + log_timestamp = self.param("log_timestamp", None, {"type": "str"}) + + email_message = f"The {pipeline_name} has completed its run.
" + + indent = "   " + + if log_timestamp: + # Get the path of the log files + log_path = os.path.join(base_path, "logs", log_timestamp) + + # Read the log file + if os.path.exists(log_path): + parameters = {} + + # Copy different log files into a main one + main_log_file = self.combine_logs( + base_path, log_timestamp, pipeline_name + ) + + # Read the full logs + with open(main_log_file) as fh: + data = fh.read() + + # Extract parameter data + parameters_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Param: (\w+) = (.*)", data + ) + parameters = {param[0]: param[1] for param in parameters_list} + + email_message += ( + "
The pipeline was run with the following parameters:
" + ) + for param_name, param_value in parameters.items(): + if param_value == "1" or param_value == "0": + param_value = bool(param_value) + email_message += f"{param_name} = {param_value}
" + + # Extract statistics data from logs + if re.search("Download", pipeline_name): + sources_data, added_species, skipped_species = {}, {}, {} + + # Get sources scheduled for download + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to download: ([\w\/]+)", + data, + ) + sources_data = { + source: {"to_download": 1} for source in matches_list + } + + # Get sources scheduled for cleanup + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to cleanup: ([\w\/]+)", + data, + ) + for source in matches_list: + sources_data[source].update({"to_cleanup": 1}) + + # Get sources cleaned up + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source ([\w\/]+) cleaned up", + data, + ) + for source in matches_list: + sources_data[source].update({"cleaned_up": 1}) + + # Get sources with skipped download + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file already exists, skipping download \((.*)\)", + data, + ) + for source in matches_list: + sources_data[source[0]].update( + {"skipped": os.path.dirname(source[1])} + ) + + # Get sources downloaded + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file downloaded via (HTTP|FTP): (.*)", + data, + ) + for source in matches_list: + sources_data[source[0]].update( + {"downloaded": source[1] + "|" + os.path.dirname(source[2])} + ) + + # Get sources copied from local ftp + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file copied from local FTP: (.*)", + data, + ) + for source in matches_list: + sources_data[source[0]].update( + {"copied": os.path.dirname(source[1])} + ) + + # Get skipped species + skipped_species_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) skipped species = (\d+)", + data, + ) + skipped_species = { + source[0]: source[1] for source in skipped_species_list + } + + # Get species with files created + added_species_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) species files created = (\d+)", + data, + ) + added_species = { + source[0]: source[1] for source in added_species_list + } + + # Add source statistics to email message + email_message += "
--Source Statistics--
" + for source_name, source_values in sources.items(): + email_message += f"{source_name}:
" + if source_values.get("to_download"): + email_message += f"{indent}Scheduled for download ✔
" + + if source_values.get("downloaded"): + (download_type, file_path) = source_values[ + "downloaded" + ].split("|") + email_message += f"{indent}File downloaded via {download_type} into {file_path}
" + elif source_values.get("copied"): + email_message += ( + indent + + "File(s) copied from local FTP into %s
" + % (source_values["copied"]) + ) + elif source_values.get("skipped"): + email_message += ( + indent + + "File(s) download skipped, already exists in %s
" + % (source_values["skipped"]) + ) + + if source_values.get("to_cleanup"): + email_message += f"{indent}Scheduled for cleanup ✔
" + if source_values.get("cleaned_up"): + email_message += f"{indent}Cleaned up ✔
" + + # Add species statistics to email message + email_message += "
--Species Statistics--
" + email_message += "Skipped Species (files already exist):
" + for source_name, count in skipped_species.items(): + email_message += f"{indent}{source_name}: {count}
" + email_message += "Added Species (files created):
" + for source_name, count in added_species.items(): + email_message += f"{indent}{source_name}: {count}
" + + email_message += "
To run the Xref Process Pipeline based on the data from this pipeline, use the same --source_db_url and --config_file values provided to this pipeline." + elif re.search("Process", pipeline_name): + parsed_sources, species_counts = {}, {} + + # Get species mapped + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Mapping starting for species '([\w\/]+)'", + data, + ) + for species_name in matches_list: + species_counts[species_name] = { + "DIRECT": 0, + "INFERRED_PAIR": 0, + "MISC": 0, + "CHECKSUM": 0, + "DEPENDENT": 0, + "SEQUENCE_MATCH": 0, + } + + # Get number of xrefs added per species per source + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| \tLoaded (\d+) ([\w\/]+) xrefs for '([\w\/]+)'", + data, + ) + for species in matches_list: + count = int(species[0]) + xref_type = species[1] + species_name = species[2] + + prev_count = species_counts[species_name][xref_type] + count += prev_count + + species_counts[species_name][xref_type] = count + + # Get parsed sources per species + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ParseSource starting for source '([\w\/]+)' with parser '([\w\/]+)' for species '([\w\/]+)'", + data, + ) + for species in matches_list: + source_name = species[0] + parser = species[1] + species_name = species[2] + + parsed_sources.setdefault(species_name, {}).update({source_name: parser}) + + # Add species statistics to email message + email_message += "&#13;
--Species Statistics--
" + for species_name, species_data in parsed_sources.items(): + email_message += f"{species_name}:
" + email_message += f"{indent}Sources parsed: " + ",".join(keys(species_data)) + + xref_counts = species_counts[species_name] + email_message += indent + "Xrefs added: " + for xref_type, count in xref_counts.items(): + email_message += f"{count} {xref_type} " + + # Send email + message = EmailMessage() + message["Subject"] = f"{pipeline_name} Finished" + message["From"] = email_address + message["To"] = email_address + message.set_content(email_message, "html") + + smtp = SMTP(email_server) + smtp.send_message(message) + + def combine_logs(self, base_path: str, timestamp: str, type: str) -> str: + ordered_processes = { + "download": [ + "ScheduleDownload", + "DownloadSource", + "ScheduleCleanup", + "Cleanup(.*)Source", + "EmailNotification", + ], + "process": [ + "ScheduleSpecies", + "ScheduleParse", + "ParseSource", + "(.*)Parser", + "DumpEnsembl", + "DumpXref", + "ScheduleAlignment", + "Alignment", + "ScheduleMapping", + "DirectXrefs", + "ProcessAlignment", + "RNACentralMapping", + "UniParcMapping", + "CoordinateMapping", + "Mapping", + "AdvisoryXrefReport", + "EmailAdvisoryXrefReport", + "EmailNotification", + ], + } + log_order = ( + ordered_processes["download"] + if re.search("Download", type) + else ordered_processes["process"] + ) + + log_path = os.path.join(base_path, "logs", timestamp) log_files = os.listdir(log_path) - parameters, sources, added_species, skipped_species = {}, {}, {}, {} - - main_log_file = os.path.join(base_path, 'logs', log_timestamp, 'logfile_'+log_timestamp) + main_log_file = os.path.join( + base_path, "logs", timestamp, "logfile_" + timestamp + ) # Copy different log files into a main one - with open(main_log_file, 'a') as out_fh: - for log_file in log_files: - if not re.search(r"^tmp_", log_file): continue - log_file = os.path.join(log_path, log_file) - with open(log_file) as in_fh: - log_data = in_fh.read() - out_fh.write(log_data) - os.remove(log_file) - - # Read the full logs - with open(main_log_file) as fh: - data = fh.read() - - # Extract parameter data - parameters_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Param: (\w+) = (.*)", data) - parameters = {param[0]: param[1] for param in parameters_list} - - email_message += '
The pipeline was run with the following parameters:
' - for param_name,param_value in parameters.items(): - email_message += f'{param_name} = {param_value}
' - - if re.search('Download', pipeline_name): - #Extract data from logs - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to download: ([\w\/]+)", data) - sources = {source : {'to_download' : 1} for source in sources_list} - - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to cleanup: ([\w\/]+)", data) - for source in sources_list: sources[source].update({'to_cleanup' : 1}) - - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to preparse: ([\w\/]+)", data) - for source in sources_list: sources[source].update({'to_preparse' : 1}) - - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source ([\w\/]+) cleaned up", data) - for source in sources_list: sources[source].update({'cleaned_up' : 1}) - - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source ([\w\/]+) preparsed", data) - for source in sources_list: sources[source].update({'preparsed' : 1}) - - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file already exists, skipping download \((.*)\)", data) - for source in sources_list: sources[source[0]].update({'skipped' : os.path.dirname(source[1])}) - - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file downloaded via (HTTP|FTP): (.*)", data) - for source in sources_list: sources[source[0]].update({'downloaded' : source[1]+"|"+os.path.dirname(source[2])}) - - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file copied from local FTP: (.*)", data) - for source in sources_list: sources[source[0]].update({'copied' : os.path.dirname(source[1])}) - - skipped_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| (\w+) skipped species = (\d+)", data) - skipped_species = {source[0]: source[1] for source in skipped_species_list} - - added_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| (\w+) species files created = (\d+)", data) - added_species = {source[0]: source[1] for source in added_species_list} - - # Include source statistics - email_message += '
--Source Statistics--
' - for source_name,source_values in sources.items(): - email_message += f'{source_name}:
' - if source_values.get('to_download'): email_message += '   Scheduled for download ✔
' - - if source_values.get('downloaded'): - (download_type, file_path) = source_values['downloaded'].split("|") - email_message += f'   File downloaded via {download_type} into {file_path}
' - elif source_values.get('copied'): email_message += '   File(s) copied from local FTP into %s
' % (source_values['copied']) - elif source_values.get('skipped'): email_message += '   File(s) download skipped, already exists in %s
' % (source_values['skipped']) - - if source_values.get('to_cleanup'): email_message += '   Scheduled for cleanup ✔
' - if source_values.get('cleaned_up'): email_message += '   Cleaned up ✔
' - - if source_values.get('to_preparse'): email_message += '   Scheduled for pre-parse ✔
' - if source_values.get('preparsed'): email_message += '   Pre-parsed ✔
' - - # Include species statistics - email_message += '
--Species Statistics--
' - email_message += 'Skipped Species (files already exist):
' - for source_name, count in skipped_species.items(): - email_message += f'   {source_name}: {count}
' - email_message += 'Added Species (files created):
' - for source_name, count in added_species.items(): - email_message += f'   {source_name}: {count}
' - - email_message += '
To run the Xref Process Pipeline based on the data from this pipeline, use the same --base_path, --source_db_url, and --central_db_url (if preparse was run) values provided to this pipeline.' - - # Send email - message = EmailMessage() - message['Subject'] = f'{pipeline_name} Finished' - message['From'] = email_address - message['To'] = email_address - message.set_content(email_message, 'html') - - smtp = SMTP(email_server) - smtp.send_message(message) - + with open(main_log_file, "a") as out_fh: + for pattern in log_order: + pattern = r"^tmp_logfile_" + pattern + r"_\d+" + matches = [s for s in log_files if re.search(pattern, s)] + + for log_file in matches: + log_file = os.path.join(log_path, log_file) + with open(log_file) as in_fh: + log_data = in_fh.read() + out_fh.write(log_data) + os.remove(log_file) + + return main_log_file diff --git a/src/python/ensembl/production/xrefs/Mapping.py b/src/python/ensembl/production/xrefs/Mapping.py new file mode 100644 index 000000000..838470c1a --- /dev/null +++ b/src/python/ensembl/production/xrefs/Mapping.py @@ -0,0 +1,91 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapping module to map the added xrefs into the core DB.""" + +from ensembl.production.xrefs.Base import * +from ensembl.production.xrefs.mappers.ProcessPriorities import ProcessPriorities +from ensembl.production.xrefs.mappers.ProcessPaired import ProcessPaired +from ensembl.production.xrefs.mappers.ProcessMoves import ProcessMoves +from ensembl.production.xrefs.mappers.OfficialNaming import OfficialNaming +from ensembl.production.xrefs.mappers.TestMappings import TestMappings +from ensembl.production.xrefs.mappers.XrefLoader import XrefLoader +from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs + + +class Mapping(Base): + def run(self): + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + registry = self.param("registry_url", None, {"type": "str"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + verbose = self.param("verbose", None, {"default": False}) + + logging.info(f"Mapping starting for species '{species_name}'") + + if not core_db_url: + core_db_url = self.get_db_from_registry( + species_name, "core", release, registry + ) + + # Get species id + db_engine = self.get_db_engine(core_db_url) + with db_engine.connect() as core_dbi: + species_id = self.get_taxon_id(core_dbi) + + # Get the appropriate mapper + mapper = self.get_xref_mapper( + xref_db_url, species_name, base_path, release, core_db_url, registry + ) + + # Process the xref priorities + priorities = ProcessPriorities(mapper) + priorities.process() + + # Process the paired xrefs + paired = ProcessPaired(mapper) + paired.process() + + # Process the needed xref moves 
+ mover = ProcessMoves(mapper) + mover.biomart_testing(verbose) + mover.source_defined_move(verbose) + mover.process_alt_alleles(verbose) + + # Set the official names for select species + naming = OfficialNaming(mapper) + naming.run(species_id, verbose) + + # Test the validity of the data before mapping into the core DB + warnings = 0 + logging.info("Testing mappings") + tester = TestMappings(mapper) + warnings += tester.direct_stable_id_check() + warnings += tester.xrefs_counts_check() + warnings += tester.name_change_check(mapper.official_name()) + + # Map xref data onto the core DB + loader = XrefLoader(mapper) + loader.update(species_name) + + # Set the display xrefs + display = DisplayXrefs(mapper) + display.build_display_xrefs() + + # Pass datachecks data + dataflow_params = {"species_name": species_name, "species_db": core_db_url} + + self.write_output("datacheck", dataflow_params) diff --git a/src/python/ensembl/production/xrefs/ParseSource.py b/src/python/ensembl/production/xrefs/ParseSource.py new file mode 100644 index 000000000..d3024fe20 --- /dev/null +++ b/src/python/ensembl/production/xrefs/ParseSource.py @@ -0,0 +1,90 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
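The warnings total accumulated by TestMappings in Mapping.run() above is not otherwise surfaced; a minimal sketch of reporting it, assuming a logged summary is the desired behaviour (an assumption, not something the patch does), reusing the warnings and species_name variables from that method:

    # Sketch: summarise sanity-check warnings before the xrefs are loaded into the core DB.
    if warnings:
        logging.warning(f"{warnings} mapping sanity-check warnings for species '{species_name}'")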
+ +"""Parsing module to call specific file/db parsers based on xref source.""" + +from ensembl.production.xrefs.Base import * + + +class ParseSource(Base): + def run(self): + parser_name = self.param_required("parser", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + species_id = self.param_required("species_id", {"type": "int"}) + file_name = self.param_required("file_name", {"type": "str"}) + source_id = self.param_required("source_id", {"type": "int"}) + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + registry = self.param_required("registry_url", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + core_db_url = self.param_required("core_db_url", {"type": "str"}) + db = self.param("db", None, {"type": "str"}) + release_file = self.param("release_file", None, {"type": "str"}) + source_name = self.param("source_name", None, {"type": "str"}) + + logging.info( + f"ParseSource starting for source '{source_name}' with parser '{parser_name}' for species '{species_name}'" + ) + + failure = 0 + message = None + + # Set parser arguments + args = { + "source_id": source_id, + "species_id": species_id, + "rel_file": release_file, + "species_name": species_name, + "file": file_name, + } + + # Connect to xref db + xref_dbi = self.get_dbi(xref_db_url) + args["xref_dbi"] = xref_dbi + + # Get the extra db, if any + if db: + dba = self.param(f"{db}_db_url") + if not dba: + dba = self.get_db_from_registry(species_name, db, release, registry) + + args["dba"] = dba + args["ensembl_release"] = release + args["core_db_url"] = core_db_url + + # For RefSeqCoordinate source, we run a perl script + if parser_name == "RefSeqCoordinateParser": + args["perl_scripts_dir"] = self.param_required("perl_scripts_dir") + args["xref_db_url"] = xref_db_url + + # For UniProt we need the hgnc file to extract descriptions + if re.search(r"^UniProt", parser_name): + args['hgnc_file'] = self.param("hgnc_file", None, {"type": "str"}) + + # Import the parser + module_name = f"ensembl.production.xrefs.parsers.{parser_name}" + module = importlib.import_module(module_name) + parser_class = getattr(module, parser_name) + parser = parser_class() + + (errors, message) = parser.run(args) + failure += errors + + xref_dbi.close() + + if failure: + raise Exception(f"Parser '{parser_name}' failed with message: {message}") + + logging.info( + f"Source '{source_name}' parsed for species '{species_name}' with the following message:\n{message}" + ) diff --git a/src/python/ensembl/production/xrefs/ProcessAlignment.py b/src/python/ensembl/production/xrefs/ProcessAlignment.py new file mode 100644 index 000000000..1f2295d43 --- /dev/null +++ b/src/python/ensembl/production/xrefs/ProcessAlignment.py @@ -0,0 +1,37 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Xref module to process the sequence matched allignments.""" + +from ensembl.production.xrefs.Base import * +from ensembl.production.xrefs.mappers.ProcessMappings import ProcessMappings + + +class ProcessAlignment(Base): + def run(self): + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + registry = self.param("registry_url", None, {"type": "str"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + + logging.info(f"ProcessAlignment starting for species '{species_name}'") + + # Get the appropriate mapper + mapper = self.get_xref_mapper( + xref_db_url, species_name, base_path, release, core_db_url, registry + ) + + # Process the alignments + mappings = ProcessMappings(mapper) + mappings.process_mappings() diff --git a/src/python/ensembl/production/xrefs/RNACentralMapping.py b/src/python/ensembl/production/xrefs/RNACentralMapping.py new file mode 100644 index 000000000..e71353f50 --- /dev/null +++ b/src/python/ensembl/production/xrefs/RNACentralMapping.py @@ -0,0 +1,62 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Xref module to process the RNAcentral mappings.""" + +from ensembl.production.xrefs.Base import * +from ensembl.production.xrefs.mappers.RNACentralMapper import RNACentralMapper +from ensembl.production.xrefs.mappers.methods.MySQLChecksum import MySQLChecksum + + +class RNACentralMapping(Base): + def run(self): + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + source_db_url = self.param_required("source_db_url", {"type": "str"}) + registry = self.param("registry_url", None, {"type": "str"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + + logging.info(f"RNACentralMapping starting for species '{species_name}'") + + if not core_db_url: + core_db_url = self.get_db_from_registry( + species_name, "core", release, registry + ) + + # Get species id + db_engine = self.get_db_engine(core_db_url) + with db_engine.connect() as core_dbi: + species_id = self.get_taxon_id(core_dbi) + + # Get the rna central mapper + mapper = RNACentralMapper( + self.get_xref_mapper( + xref_db_url, species_name, base_path, release, core_db_url, registry + ) + ) + + # Get source id + db_engine = self.get_db_engine(source_db_url) + with db_engine.connect() as source_dbi: + source_id = self.get_source_id_from_name(source_dbi, "RNACentral") + + method = MySQLChecksum({"MAPPER": mapper}) + results = method.run( + mapper.target(), source_id, mapper.object_type(), source_dbi + ) + + if results: + mapper.upload(results, species_id) diff --git a/src/python/ensembl/production/xrefs/ScheduleAlignment.py b/src/python/ensembl/production/xrefs/ScheduleAlignment.py new file mode 100644 index 000000000..d1fca7697 --- /dev/null +++ b/src/python/ensembl/production/xrefs/ScheduleAlignment.py @@ -0,0 +1,73 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Scheduling module to create xref/ensEMBL alignment jobs.""" + +from ensembl.production.xrefs.Base import * + + +class ScheduleAlignment(Base): + def run(self): + species_name = self.param_required("species_name", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + target_file = self.param_required("ensembl_fasta", {"type": "str"}) + source_file = self.param_required("xref_fasta", {"type": "str"}) + seq_type = self.param_required("seq_type", {"type": "str"}) + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + method = self.param_required("method", {"type": "str"}) + query_cutoff = self.param_required("query_cutoff", {"type": "int"}) + target_cutoff = self.param_required("target_cutoff", {"type": "int"}) + source_id = self.param_required("source_id", {"type": "int"}) + source_name = self.param_required("source_name", {"type": "str"}) + job_index = self.param_required("job_index", {"type": "int"}) + + logging.info( + f"ScheduleAlignment starting for species '{species_name}' with seq_type '{seq_type}' and job_index '{job_index}'" + ) + + # Inspect file size to decide on chunking + size = os.stat(target_file).st_size + chunks = int(size / 1000000) + 1 + + # Create output path + output_path = self.get_path(base_path, species_name, release, "alignment") + + # Pass alignment data for each chunk + chunklet = 1 + while chunklet <= chunks: + output_path_chunk = os.path.join( + output_path, + f"{seq_type}_alignment_{source_id}_{chunklet}_of_{chunks}.map", + ) + self.write_output( + "alignment", + { + "species_name": species_name, + "align_method": method, + "query_cutoff": query_cutoff, + "target_cutoff": target_cutoff, + "max_chunks": chunks, + "chunk": chunklet, + "job_index": job_index, + "source_file": source_file, + "target_file": target_file, + "xref_db_url": xref_db_url, + "map_file": output_path_chunk, + "source_id": source_id, + "source_name": source_name, + "seq_type": seq_type, + }, + ) + chunklet += 1 diff --git a/src/python/ensembl/production/xrefs/ScheduleCleanup.py b/src/python/ensembl/production/xrefs/ScheduleCleanup.py index 58396b33a..eeddf94e1 100644 --- a/src/python/ensembl/production/xrefs/ScheduleCleanup.py +++ b/src/python/ensembl/production/xrefs/ScheduleCleanup.py @@ -16,42 +16,48 @@ from ensembl.production.xrefs.Base import * -class ScheduleCleanup(Base): - def run(self): - base_path = self.param_required('base_path') - source_db_url = self.param_required('source_db_url') - clean_files = self.param('clean_files') - clean_dir = self.param('clean_dir') - split_files_by_species = self.param('split_files_by_species') - - logging.info('ScheduleCleanup starting with parameters:') - logging.info(f'Param: base_path = {base_path}') - logging.info(f'Param: source_db_url = {source_db_url}') - logging.info(f'Param: clean_files = {clean_files}') - logging.info(f'Param: clean_dir = {clean_dir}') - logging.info(f'Param: split_files_by_species = {split_files_by_species}') - - # Connect to source db - db_engine = self.get_db_engine(source_db_url) - with db_engine.connect() as dbi: - # Get name and version file for each source - query = select(SourceSORM.name, VersionORM.revision).where(SourceSORM.source_id==VersionORM.source_id).distinct() - sources = dbi.execute(query).mappings().all() - - for source in sources: - # Only cleaning RefSeq and UniProt for now - if not (re.search(r"^RefSeq_(dna|peptide)", source.name) or re.search(r"^Uniprot", source.name)): continue - - # Remove / 
char from source name to access directory - clean_name = source.name - clean_name = re.sub(r"\/", "", clean_name) - - # Send parameters into cleanup jobs for each source - if os.path.exists(os.path.join(base_path, clean_name)): - logging.info(f'Source to cleanup: {source.name}') - - self.write_output('cleanup_sources', { - 'name' : source.name, - 'version_file' : source.revision - }) +class ScheduleCleanup(Base): + def run(self): + base_path = self.param_required("base_path", {"type": "str"}) + source_db_url = self.param_required("source_db_url", {"type": "str"}) + clean_files = self.param("clean_files", None, {"type": "bool"}) + clean_dir = self.param("clean_dir", None, {"type": "str"}) + split_files_by_species = self.param("split_files_by_species", None, {"type": "bool"}) + + logging.info("ScheduleCleanup starting with parameters:") + logging.info(f"Param: base_path = {base_path}") + logging.info(f"Param: source_db_url = {source_db_url}") + logging.info(f"Param: clean_files = {clean_files}") + logging.info(f"Param: clean_dir = {clean_dir}") + logging.info(f"Param: split_files_by_species = {split_files_by_species}") + + # Connect to source db + db_engine = self.get_db_engine(source_db_url) + with db_engine.connect() as dbi: + # Get name and version file for each source + query = select(SourceSORM.name.distinct(), VersionORM.revision).where( + SourceSORM.source_id == VersionORM.source_id + ) + sources = dbi.execute(query).mappings().all() + + for source in sources: + # Only cleaning RefSeq and UniProt for now + if not ( + re.search(r"^RefSeq_(dna|peptide)", source.name) + or re.search(r"^Uniprot", source.name) + ): + continue + + # Remove / char from source name to access directory + clean_name = source.name + clean_name = re.sub(r"\/", "", clean_name) + + # Send parameters into cleanup jobs for each source + if os.path.exists(os.path.join(base_path, clean_name)): + logging.info(f"Source to cleanup: {source.name}") + + self.write_output( + "cleanup_sources", + {"name": source.name, "version_file": source.revision}, + ) diff --git a/src/python/ensembl/production/xrefs/ScheduleDownload.py b/src/python/ensembl/production/xrefs/ScheduleDownload.py index 8001bccc8..f9af93454 100644 --- a/src/python/ensembl/production/xrefs/ScheduleDownload.py +++ b/src/python/ensembl/production/xrefs/ScheduleDownload.py @@ -16,58 +16,54 @@ from ensembl.production.xrefs.Base import * -class ScheduleDownload(Base): - def run(self): - config_file = self.param_required('config_file') - source_db_url = self.param_required('source_db_url') - reuse_db = self.param_required('reuse_db', {'type': 'bool'}) - skip_preparse = self.param('skip_preparse', None, {'type': 'bool', 'default' : False}) - - logging.info('ScheduleDownload starting with parameters:') - logging.info(f'Param: config_file = {config_file}') - logging.info(f'Param: source_db_url = {source_db_url}') - logging.info(f'Param: reuse_db = {reuse_db}') - logging.info(f'Param: skip_preparse = {skip_preparse}') - # Create the source db from url - self.create_source_db(source_db_url, reuse_db) +class ScheduleDownload(Base): + def run(self): + config_file = self.param_required("config_file", {"type": "str"}) + source_db_url = self.param_required("source_db_url", {"type": "str"}) + reuse_db = self.param_required("reuse_db", {"type": "bool"}) - # Extract sources to download from config file - sources = [] - with open(config_file) as conf_file: - sources = json.load(conf_file) + logging.info("ScheduleDownload starting with parameters:") + logging.info(f"Param: 
config_file = {config_file}") + logging.info(f"Param: source_db_url = {source_db_url}") + logging.info(f"Param: reuse_db = {reuse_db}") - if len(sources) < 1: - raise IOError(f'No sources found in config file {config_file}. Need sources to run pipeline') + # Create the source db from url + self.create_source_db(source_db_url, reuse_db) - for source_data in sources: - name = source_data['name'] - parser = source_data['parser'] - priority = source_data['priority'] - file = source_data['file'] - db = source_data.get('db') - version_file = source_data.get('release') - preparse = source_data.get('preparse') - rel_number = source_data.get('release_number') - catalog = source_data.get('catalog') + # Extract sources to download from config file + sources = [] + with open(config_file) as conf_file: + sources = json.load(conf_file) - logging.info(f'Source to download: {name}') + if len(sources) < 1: + raise IOError( + f"No sources found in config file {config_file}. Need sources to run pipeline" + ) - # Revert to the old parser if not pre-parsing - if preparse and skip_preparse: - parser = source_data['old_parser'] - preparse = 0 + for source_data in sources: + name = source_data["name"] + parser = source_data["parser"] + priority = source_data["priority"] + file = source_data["file"] + db = source_data.get("db") + version_file = source_data.get("release") + rel_number = source_data.get("release_number") + catalog = source_data.get("catalog") - # Pass the source parameters into download jobs - self.write_output('sources', { - 'parser' : parser, - 'name' : name, - 'priority' : priority, - 'db' : db, - 'version_file' : version_file, - 'preparse' : preparse, - 'file' : file, - 'rel_number' : rel_number, - 'catalog' : catalog - }) + logging.info(f"Source to download: {name}") + # Pass the source parameters into download jobs + self.write_output( + "sources", + { + "parser": parser, + "name": name, + "priority": priority, + "db": db, + "version_file": version_file, + "file": file, + "rel_number": rel_number, + "catalog": catalog, + }, + ) diff --git a/src/python/ensembl/production/xrefs/ScheduleMapping.py b/src/python/ensembl/production/xrefs/ScheduleMapping.py new file mode 100644 index 000000000..44032ad76 --- /dev/null +++ b/src/python/ensembl/production/xrefs/ScheduleMapping.py @@ -0,0 +1,56 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
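For reference, ScheduleDownload.run() above expects --config_file to be a JSON array of source entries; a minimal entry, shown here as a Python literal with values taken from xref_all_sources.json further down, looks like this. The db, release, release_number and catalog keys are optional and are read with .get() in the same loop.

    # One entry of the sources config consumed by ScheduleDownload (illustrative):
    {
        "name": "RGD",
        "parser": "RGDParser",
        "file": "https://download.rgd.mcw.edu/pub/data_release/GENES.RAT.txt",
        "priority": 2,
    }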
+ +"""Scheduling module to xref mapping jobs.""" + +from ensembl.production.xrefs.Base import * +from ensembl.production.xrefs.mappers.CoreInfo import CoreInfo + + +class ScheduleMapping(Base): + def run(self): + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + registry = self.param("registry_url", None, {"type": "str"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + + logging.info(f"ScheduleMapping starting for species '{species_name}'") + + # Get the appropriate mapper + mapper = self.get_xref_mapper( + xref_db_url, species_name, base_path, release, core_db_url, registry + ) + + # Load the core data + logging.info("Loading core data") + core_info = CoreInfo(mapper) + core_info.get_core_data() + core_info.get_alt_alleles() + + if not core_db_url: + core_db_url = self.get_db_from_registry( + species_name, "core", release, registry + ) + + # Pass mapping data + dataflow_params = { + "xref_db_url": xref_db_url, + "species_name": species_name, + "species_db": core_db_url, + } + + self.write_output("pre_mapping", dataflow_params) + self.write_output("mapping", dataflow_params) diff --git a/src/python/ensembl/production/xrefs/ScheduleParse.py b/src/python/ensembl/production/xrefs/ScheduleParse.py new file mode 100644 index 000000000..cf044e1ee --- /dev/null +++ b/src/python/ensembl/production/xrefs/ScheduleParse.py @@ -0,0 +1,219 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Scheduling module to create parsing jobs for each xref source.""" + +import glob + +from ensembl.production.xrefs.Base import * + + +class ScheduleParse(Base): + def run(self): + species_name = self.param_required("species_name", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + registry = self.param_required("registry_url", {"type": "str"}) + order_priority = self.param_required("priority", {"type": "int"}) + source_db_url = self.param_required("source_db_url", {"type": "str"}) + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + get_species_file = self.param_required("get_species_file", {"type": "bool"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + + logging.info(f"ScheduleParse starting for species '{species_name}'") + logging.info(f"\tParam: order_priority = {order_priority}") + logging.info(f"\tParam: source_db_url = {source_db_url}") + logging.info(f"\tParam: xref_db_url = {xref_db_url}") + logging.info(f"\tParams: core_db_url = {core_db_url}") + + dataflow_suffix, dataflow_sub_suffix = "", "" + + # Create Xref database only at priority 1 (one time) + if order_priority == 1: + sources_config_file = self.param_required("sources_config_file") + logging.info(f"\tParam: sources_config_file = {sources_config_file}") + + # Construct xref update url + xref_db_url = make_url(xref_db_url) + xref_db_url = xref_db_url.set( + database=f"{species_name}_xref_update_{release}" + ) + self.create_xref_db(xref_db_url, sources_config_file) + xref_db_url = xref_db_url.render_as_string(hide_password=False) + + dataflow_suffix = "primary_sources" + dataflow_sub_suffix = "schedule_secondary" + elif order_priority == 2: + dataflow_suffix = "secondary_sources" + dataflow_sub_suffix = "schedule_tertiary" + elif order_priority == 3: + dataflow_suffix = "tertiary_sources" + dataflow_sub_suffix = "dump_ensembl" + else: + raise AttributeError("Parameter 'priority' can only be of value 1, 2, or 3") + + # Get core db from registry if not provided + if not core_db_url: + core_db_url = self.get_db_from_registry( + species_name, "core", release, registry + ) + if not re.search(r"^mysql://", core_db_url): + core_db_url = "mysql://" + core_db_url + + # Get species and division ids + db_engine = self.get_db_engine(core_db_url) + with db_engine.connect() as core_dbi: + species_id = self.get_taxon_id(core_dbi) + division_id = self.get_division_id(core_dbi) + + # Retrieve list of sources from source database + db_engine = self.get_db_engine(source_db_url) + with db_engine.connect() as source_dbi: + query = ( + select( + SourceSORM.name.distinct(), + SourceSORM.parser, + VersionORM.file_path, + VersionORM.clean_path, + VersionORM.db, + VersionORM.priority, + VersionORM.revision, + ) + .where(SourceSORM.source_id == VersionORM.source_id) + .order_by(SourceSORM.name) + ) + sources = source_dbi.execute(query).mappings().all() + + # Connect to the xref intermediate db + xref_dbi = self.get_dbi(xref_db_url) + + hgnc_path = None + total_sources = 0 + + for source in sources: + if source.name == "HGNC": + hgnc_path = source.file_path + + if source.db == "checksum": + continue + if source.priority != order_priority: + continue + + dataflow_params = { + "species_name": species_name, + "species_id": species_id, + "core_db_url": core_db_url, + "xref_db_url": xref_db_url, + } + + # Use clean files if available + file_name = source.file_path + if source.clean_path: + file_name = source.clean_path + + # Some sources are species-specific + source_id = 
self.get_source_id( + xref_dbi, source.parser, species_id, source.name, division_id + ) + if not source_id: + continue + + dataflow_params["source_id"] = source_id + dataflow_params["source_name"] = source.name + dataflow_params["parser"] = source.parser + if source.revision: + dataflow_params["release_file"] = source.revision + + # Some sources need a connection to a special database + if source.db: + dataflow_params["db"] = source.db + + if source.db != "core": + db_url = self.get_db_from_registry( + species_name, source.db, release, registry + ) + if not db_url: + # Not all species have an otherfeatures database + if source.db == "otherfeatures": + continue + else: + raise LookupError( + f"Cannot use {source.parser} for {species_name}, no {source.db} database" + ) + else: + dataflow_params[f"{source.db}_db_url"] = db_url + + logging.info( + f"Parser '{source.parser}' for source '{source.name}' scheduled for species '{species_name}'" + ) + + if file_name == "Database": + dataflow_params["file_name"] = file_name + self.write_output(dataflow_suffix, dataflow_params) + total_sources += 1 + else: + # Get list of files if directory + if os.path.isdir(file_name): + list_files = os.listdir(file_name) + list_files = [os.path.join(file_name, f) for f in list_files] + else: + list_files = [file_name] + + # For Uniprot and Refseq, files might have been split by species + if get_species_file: + match source.name: + case "Uniprot/SWISSPROT": + file_prefix = "uniprot_sprot" + case "Uniprot/SPTREMBL": + file_prefix = "uniprot_trembl" + case "RefSeq_dna": + file_prefix = "refseq_rna" + case "RefSeq_peptide": + file_prefix = "refseq_protein" + case _: + file_prefix = None + + if file_prefix: + list_files = glob.glob( + file_name + "/**/" + file_prefix + "-" + str(species_id), + recursive=True, + ) + + if source.name == "ZFIN_ID": + list_files = [list_files[0]] + + for file in list_files: + if source.revision and file == source.revision: + continue + + dataflow_params["file_name"] = file + + if re.search(r"^Uniprot", source.name): + hgnc_files = glob.glob(hgnc_path + "/*") + dataflow_params["hgnc_file"] = hgnc_files[0] + + self.write_output(dataflow_suffix, dataflow_params) + total_sources += 1 + + xref_dbi.close() + + if total_sources == 0: + with open(f"dataflow_{dataflow_suffix}.json", "a") as fh: + fh.write("") + + dataflow_params = { + "species_name": species_name, + "species_db": core_db_url, + "xref_db_url": xref_db_url, + } + self.write_output(dataflow_sub_suffix, dataflow_params) diff --git a/src/python/ensembl/production/xrefs/ScheduleSpecies.py b/src/python/ensembl/production/xrefs/ScheduleSpecies.py new file mode 100644 index 000000000..e63de241a --- /dev/null +++ b/src/python/ensembl/production/xrefs/ScheduleSpecies.py @@ -0,0 +1,178 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
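A worked example of the species-split file lookup near the end of ScheduleParse.run() above, using hypothetical paths: for source 'Uniprot/SWISSPROT' the prefix is 'uniprot_sprot', so for species_id 9606 the recursive glob narrows the list to the per-species chunk (file_name here is the source's file_path directory from the code above).

    import glob

    # file_prefix + "-" + str(species_id), as in the code above:
    list_files = glob.glob(file_name + "/**/uniprot_sprot-9606", recursive=True)
    # e.g. ["<base_path>/Uniprot/SWISSPROT/split/uniprot_sprot-9606"]  (hypothetical path)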
+ +"""Scheduling module to create a pipeline branch for each species in list or division.""" + +from ensembl.production.xrefs.Base import * + + +class ScheduleSpecies(Base): + def run(self): + run_all = self.param_required("run_all", {"type": "bool"}) + registry = self.param_required("registry_url", {"type": "str"}) + ensembl_release = self.param_required("release", {"type": "int"}) + metasearch_url = self.param_required("metasearch_url", {"type": "str"}) + species = self.param("species", None, {"default": "", "type": "str"}) + antispecies = self.param("antispecies", None, {"default": "", "type": "str"}) + division = self.param("division", None, {"default": "", "type": "str"}) + db_prefix = self.param("db_prefix", None, {"type": "str"}) + group = self.param("group", None, {"default": "core", "type": "str"}) + + logging.info("ScheduleSpecies starting with parameters:") + logging.info(f"\tParam: run_all = {run_all}") + logging.info(f"\tParam: registry = {registry}") + logging.info(f"\tParam: release = {ensembl_release}") + logging.info(f"\tParam: metasearch_url = {metasearch_url}") + logging.info(f"\tParam: species = {species}") + logging.info(f"\tParam: antispecies = {antispecies}") + logging.info(f"\tParam: division = {division}") + logging.info(f"\tParam: db_prefix = {db_prefix}") + logging.info(f"\tParam: group = {group}") + + if species: + species = species.split(",") + if antispecies: + antispecies = antispecies.split(",") + if division: + division = division.split(",") + ensembl_release = str(ensembl_release) + + # Fix registry url, if needed + match = re.search(r"^(.*)://(.*)", registry) + if match: + registry = match.group(2) + match = re.search(r"(.*)/(.*)", registry) + if match: + registry = match.group(1) + + loaded_dbs = {} + dbs = [] + + # Construct the db name pattern + name_pattern = f"%_{group}%" + if db_prefix: + db_prefix = f"{db_prefix}_" + else: + db_prefix = "" + name_pattern = f"{db_prefix}{name_pattern}" + + # Getting all dbs + if run_all: + metasearch_body = { + "name_pattern": name_pattern, + "filters": [ + {"meta_key": "schema_version", "meta_value": ensembl_release}, + ], + "servers": [registry], + } + + # Query registry for all core dbs + dbs = requests.post(metasearch_url, json=metasearch_body).json() + dbs = dbs[registry] + + loaded_dbs = self.check_validity(dbs, db_prefix, group, ensembl_release) + + # Getting dbs for specified species + elif species and len(species) > 0: + for species_name in species: + name_pattern = f"{species_name}_core%" + name_pattern = f"{db_prefix}{name_pattern}" + + metasearch_body = { + "name_pattern": name_pattern, + "filters": [ + {"meta_key": "schema_version", "meta_value": ensembl_release}, + ], + "servers": [registry], + } + + # Query registry for species dbs + species_dbs = requests.post(metasearch_url, json=metasearch_body).json() + + if len(species_dbs[registry]) < 1: + raise IOError( + f"Database not found for {species_name}, check registry parameters" + ) + else: + dbs = dbs + species_dbs[registry] + + loaded_dbs = self.check_validity(dbs, db_prefix, group, ensembl_release) + + # Check if all wanted species were found + for species_name in species: + if not loaded_dbs.get(species_name): + raise IOError( + f"Database not found for {species_name}, check registry parameters" + ) + + # Getting dbs for specified divisions + elif division and len(division) > 0: + for div in division: + metasearch_body = { + "name_pattern": name_pattern, + "filters": [ + {"meta_key": "schema_version", "meta_value": ensembl_release}, + {"meta_key": 
"species.division", "meta_value": div}, + ], + "servers": [registry], + } + + # Query registry for dbs in division + div_dbs = requests.post(metasearch_url, json=metasearch_body).json() + dbs = dbs + div_dbs[registry] + + loaded_dbs = self.check_validity(dbs, db_prefix, group, ensembl_release) + + if len(loaded_dbs) == 0: + raise IOError(f"Could not find any matching dbs in registry {registry}") + + if run_all: + logging.info(f"All species in {len(loaded_dbs)} databases loaded") + + # Write dataflow output + for species_name, db in loaded_dbs.items(): + if species_name not in antispecies: + self.write_output( + "species", {"species_name": species_name, "species_db": db} + ) + + def check_validity(self, dbs: List(str), prefix: str, group: str, release: str): + valid_dbs = {} + + for db in dbs: + # Extract db name + db_name = db + match = re.search(r"(.*)/(.*)", db_name) + if match: + db_name = match.group(2) + + # Check if db is valid + match = re.search( + r"^(%s)([a-z]+_[a-z0-9]+(?:_[a-z0-9]+)?)_%s(?:_\d+)?_%s_(\w+)$" + % (prefix, group, release), + db_name, + ) + if match: + species_name = match.group(2) + if not valid_dbs.get(species_name): + logging.info(f"Species {species_name} loaded") + valid_dbs[species_name] = db + else: + raise IOError( + f"Database {valid_dbs[species_name]} already loaded for species {species_name}, cannot load second database {db}" + ) + else: + logging.info(f"Could not extract species name from database {db}") + + return valid_dbs diff --git a/src/python/ensembl/production/xrefs/UniParcMapping.py b/src/python/ensembl/production/xrefs/UniParcMapping.py new file mode 100644 index 000000000..86668b621 --- /dev/null +++ b/src/python/ensembl/production/xrefs/UniParcMapping.py @@ -0,0 +1,62 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Xref module to process the Uniparc mappings.""" + +from ensembl.production.xrefs.Base import * +from ensembl.production.xrefs.mappers.UniParcMapper import UniParcMapper +from ensembl.production.xrefs.mappers.methods.MySQLChecksum import MySQLChecksum + + +class UniParcMapping(Base): + def run(self): + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + source_db_url = self.param_required("source_db_url", {"type": "str"}) + registry = self.param("registry_url", None, {"type": "str"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + + logging.info(f"UniParcMapping starting for species '{species_name}'") + + if not core_db_url: + core_db_url = self.get_db_from_registry( + species_name, "core", release, registry + ) + + # Get species id + db_engine = self.get_db_engine(core_db_url) + with db_engine.connect() as core_dbi: + species_id = self.get_taxon_id(core_dbi) + + # Get the uniparc mapper + mapper = UniParcMapper( + self.get_xref_mapper( + xref_db_url, species_name, base_path, release, core_db_url, registry + ) + ) + + # Get source id + db_engine = self.get_db_engine(source_db_url) + with db_engine.connect() as source_dbi: + source_id = self.get_source_id_from_name(source_dbi, "UniParc") + + method = MySQLChecksum({"MAPPER": mapper}) + results = method.run( + mapper.target(), source_id, mapper.object_type(), source_dbi + ) + + if results: + mapper.upload(results, species_id) diff --git a/src/python/ensembl/production/xrefs/__init__.py b/src/python/ensembl/production/xrefs/__init__.py new file mode 100644 index 000000000..8dd00df34 --- /dev/null +++ b/src/python/ensembl/production/xrefs/__init__.py @@ -0,0 +1,15 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Xref pipeline modules.""" diff --git a/src/python/ensembl/production/xrefs/config/xref_all_sources.json b/src/python/ensembl/production/xrefs/config/xref_all_sources.json index e7b0065a4..929450ebf 100644 --- a/src/python/ensembl/production/xrefs/config/xref_all_sources.json +++ b/src/python/ensembl/production/xrefs/config/xref_all_sources.json @@ -6,13 +6,6 @@ "db" : "core", "priority" : 1 }, - { - "name" : "CCDS", - "parser" : "CCDSParser", - "file" : "Database", - "db" : "ccds", - "priority" : 1 - }, { "name" : "UniParc", "parser" : "ChecksumParser", @@ -88,10 +81,17 @@ "db" : "core", "priority" : 1 }, + { + "name" : "RFAM", + "parser" : "CoreXrefParser", + "file" : "script:logic_name=>rfam_12.2_gene,object_type=>gene", + "db" : "core", + "priority" : 1 + } { "name" : "RGD", "parser" : "RGDParser", - "file" : "https://download.rgd.mcw.edu/pub/data_release/GENES_RAT.txt", + "file" : "https://download.rgd.mcw.edu/pub/data_release/GENES.RAT.txt", "priority" : 2 }, { @@ -110,28 +110,24 @@ }, { "name" : "RefSeq_dna", - "parser" : "RefSeqDatabaseParser", - "old_parser" : "RefSeqGPFFParser", + "parser" : "RefSeqGPFFParser", "file" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/complete.*rna.gbff.gz", "method" : "--bestn 5", "query_cutoff" : 90, "target_cutoff" : 90, "release" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-notes/RefSeq-release*.txt", - "preparse" : 1, "priority" : 2, "release_number" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER", "catalog" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/release*.files.installed" }, { "name" : "RefSeq_peptide", - "parser" : "RefSeqDatabaseParser", - "old_parser" : "RefSeqGPFFParser", + "parser" : "RefSeqGPFFParser", "file" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/complete.*.protein.gpff.gz", "method" : "--bestn 1", "query_cutoff" : 100, "target_cutoff" : 100, "release" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-notes/RefSeq-release*.txt", - "preparse" : 1, "priority" : 3, "release_number" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER", "catalog" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/release*.files.installed" @@ -159,25 +155,21 @@ }, { "name" : "Uniprot/SWISSPROT", - "parser" : "UniProtDatabaseParser", - "old_parser" : "UniProtParser", + "parser" : "UniProtParser", "file" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_sprot.dat.gz", "method" : "--bestn 1", "query_cutoff" : 100, "target_cutoff" : 100, - "preparse" : 1, "release" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/reldate.txt", "priority" : 1 }, { "name" : "Uniprot/SPTREMBL", - "parser" : "UniProtDatabaseParser", - "old_parser" : "UniProtParser", + "parser" : "UniProtParser", "file" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_trembl.dat.gz", "method" : "--bestn 1", "query_cutoff" : 100, "target_cutoff" : 100, - "preparse" : 1, "release" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/reldate.txt", "priority" : 1 }, @@ -187,35 +179,35 @@ "file" : "https://ftp.ebi.ac.uk/pub/databases/genenames/vgnc/tsv/vgnc_gene_set_All.txt.gz", "priority" : 1 }, + { + "name" : "ZFIN_desc", + "parser" : "ZFINDescParser", + "file" : "ftp://zfin.org/pub/transfer/MEOW/zfin_genes.txt", + "priority" : 1 + }, { "name" : "ZFIN_ID", "parser" : "ZFINParser", - "file" : "https://zfin.org/data_transfer/Downloads/refseq.txt", + "file" : "https://zfin.org/data_transfer/Downloads/uniprot.txt", "priority" : 3 }, { "name" : 
"ZFIN_ID", "parser" : "ZFINParser", - "file" : "https://zfin.org/data_transfer/Downloads/uniprot.txt", - "priority" : 2 + "file" : "https://zfin.org/data_transfer/Downloads/refseq.txt", + "priority" : 3 }, { "name" : "ZFIN_ID", "parser" : "ZFINParser", "file" : "https://zfin.org/data_transfer/Downloads/aliases.txt", - "priority" : 2 + "priority" : 3 }, { "name" : "ZFIN_ID", "parser" : "ZFINParser", - "file" : "https://zfin.org/data_transfer/Downloads/gene_seq.txt", - "priority" : 1 - }, - { - "name" : "ZFIN_desc", - "parser" : "ZFINDescParser", - "file" : "ftp://zfin.org/pub/transfer/MEOW/zfin_genes.txt", - "priority" : 1 + "file" : "https://zfin.org/downloads/ensembl_1_to_1.txt", + "priority" : 3 }, { "name" : "cint_jgi_v1", @@ -245,4 +237,4 @@ "db" : "ccds", "priority" : 3 } -] +] \ No newline at end of file diff --git a/src/python/ensembl/production/xrefs/config/xref_config.ini b/src/python/ensembl/production/xrefs/config/xref_config.ini index 5a4830d52..ca3452245 100644 --- a/src/python/ensembl/production/xrefs/config/xref_config.ini +++ b/src/python/ensembl/production/xrefs/config/xref_config.ini @@ -519,8 +519,7 @@ name = RefSeq_dna order = 15 priority = 2 prio_descr = refseq -parser = RefSeqDatabaseParser -old_parser = RefSeqGPFFParser +parser = RefSeqGPFFParser [source RefSeq_dna::gencode] # Used by human and mouse @@ -543,8 +542,7 @@ name = RefSeq_dna order = 15 priority = 2 prio_descr = refseq -parser = RefSeqDatabaseParser -old_parser = RefSeqGPFFParser +parser = RefSeqGPFFParser [source RefSeq_dna::MULTI-complete] # Used by phaeodactylum_tricornutum @@ -728,8 +726,7 @@ name = RefSeq_peptide order = 25 priority = 2 prio_descr = refseq -parser = RefSeqDatabaseParser -old_parser = RefSeqGPFFParser +parser = RefSeqGPFFParser [source SGD_GENE::saccharomyces_cerevisiae] # Used by saccharomyces_cerevisiae @@ -822,8 +819,7 @@ name = Uniprot/SPTREMBL order = 20 priority = 3 prio_descr = sequence_mapped -parser = UniProtDatabaseParser -old_parser = UniProtParser +parser = UniProtParser dependent_on = MIM [source Uniprot/SPTREMBL::gencode] @@ -851,8 +847,7 @@ name = Uniprot/SWISSPROT order = 20 priority = 3 prio_descr = sequence_mapped -parser = UniProtDatabaseParser -old_parser = UniProtParser +parser = UniProtParser dependent_on = MIM [source Uniprot/SWISSPROT::gencode] @@ -937,6 +932,13 @@ order = 20 priority = 1 parser = UniProtParser +[source UniProt::STRING] +# Special source used in UniProtParser. No species uses this source. 
+name = STRING +order = 20 +priority = 1 +parser = UniProtParser + [source UniParc::MULTI] name = UniParc order = 20 @@ -1004,6 +1006,14 @@ parser = XenopusJamboreeParser name = ZFIN_ID order = 31 priority = 1 +prio_descr = direct +parser = ZFINParser + +[source ZFIN_ID::danio_rerio#02] +# Used by danio_rerio +name = ZFIN_ID +order = 31 +priority = 2 prio_descr = uniprot/refseq parser = ZFINParser @@ -1488,8 +1498,6 @@ sources = ZFIN_ID::danio_rerio#01,ZFIN_ID::danio_rerio#03 taxonomy_id = 10116 sources = RGD::rattus_norvegicus,RGD::rattus_norvegicus#02 - - [species ciona_intestinalis] taxonomy_id = 7719 sources = cint_jgi_v1::ciona_intestinalis @@ -1588,7 +1596,7 @@ sources = PomBase::schizosaccharomyces_pombe [species plants] taxonomy_id = 33090 -sources = EntrezGene::MULTI,Reactome::MULTI,RNACentral::MULTI,RefSeq_dna::MULTI-Plants,RefSeq_import::otherfeatures,Uniprot/SPTREMBL::MULTI,Uniprot/SWISSPROT::MULTI,UniParc::MULTI,RFAM::MULTI,miRBase::MULTI,ArrayExpress::MULTI,ncRNA_EG::EG,misc_EG::EG +sources = EntrezGene::MULTI,Reactome::MULTI,RNACentral::MULTI,RefSeq_dna::MULTI-Plants,RefSeq_peptide::MULTI-Plants,RefSeq_import::otherfeatures,Uniprot/SPTREMBL::MULTI,Uniprot/SWISSPROT::MULTI,UniParc::MULTI,RFAM::MULTI,miRBase::MULTI,ArrayExpress::MULTI,ncRNA_EG::EG,misc_EG::EG [species glycine_max] taxonomy_id = 3847 @@ -1677,4 +1685,3 @@ sources = wormbase::tmuris [species protist] taxonomy_id = 2759 sources = EntrezGene::MULTI,RefSeq_dna::MULTI-complete,RefSeq_peptide::MULTI-complete,Uniprot/SPTREMBL::MULTI,Uniprot/SWISSPROT::MULTI,TRNASCAN_SE::MULTI,RNAMMER::MULTI,ArrayExpress::EG,PHIbase::MULTI,miRBase::MULTI,misc_EG::EG,RFAM::EG - diff --git a/src/python/ensembl/production/xrefs/mappers/BasicMapper.py b/src/python/ensembl/production/xrefs/mappers/BasicMapper.py new file mode 100644 index 000000000..362eea354 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/BasicMapper.py @@ -0,0 +1,432 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
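
The xref_config.ini changes above follow the file's usual layout: each [source NAME::scope] section carries name/order/priority/prio_descr/parser keys, and each [species ...] section lists a taxonomy_id plus a comma-separated sources line. The pipeline has its own loader for this file, but a throwaway snippet like the following (path and section names taken from the file, the parsing code itself purely illustrative) is a quick way to sanity-check a section after editing:

    from configparser import ConfigParser

    config = ConfigParser(interpolation=None, strict=False)
    config.read("src/python/ensembl/production/xrefs/config/xref_config.ini")

    # The new danio_rerio ZFIN_ID priority-2 source added above
    src = config["source ZFIN_ID::danio_rerio#02"]
    print(src["name"], src["order"], src["priority"], src["prio_descr"], src["parser"])

    # A species section: taxonomy id plus its comma-separated source list
    plants = config["species plants"]
    print(plants["taxonomy_id"], plants["sources"].split(","))
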
+ +"""Base module to handle xref mapping.""" + +import re +import os +import sys +import warnings +import logging +import subprocess + +from sqlalchemy import select, insert, update, func, delete, desc, text +from sqlalchemy.engine import Engine, Connection +from sqlalchemy.orm import Session, sessionmaker, aliased +from sqlalchemy.sql.expression import case +from sqlalchemy.sql import Select +from typing import Dict, Any, List, Optional, Tuple + +from ensembl.core.models import ( + Gene as GeneORM, + Transcript as TranscriptORM, + Translation as TranslationORM, + Meta as MetaCORM, + AltAllele as AltAlleleCORM, + t_alt_allele_attrib as AltAlleleAttribORM, + ObjectXref as ObjectXrefCORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + UnmappedObject as UnmappedObjectORM, + UnmappedReason as UnmappedReasonORM, + Analysis as AnalysisORM, + OntologyXref as OntologyXrefORM, + ExternalSynonym as ExternalSynonymORM, + DependentXref as DependentXrefCORM, + IdentityXref as IdentityXrefCORM, + SeqRegionAttrib as SeqRegionAttribORM, + AttribType as AttribTypeORM, +) + +from ensembl.xrefs.xref_update_db_model import ( + GeneTranscriptTranslation as GeneTranscriptTranslationORM, + GeneStableId as GeneStableIdORM, + TranscriptStableId as TranscriptStableIdORM, + TranslationStableId as TranslationStableIdORM, + Meta as MetaUORM, + ProcessStatus as ProcessStatusORM, + ObjectXref as ObjectXrefUORM, + AltAllele as AltAlleleUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + GeneDirectXref as GeneDirectXrefORM, + TranscriptDirectXref as TranscriptDirectXrefORM, + TranslationDirectXref as TranslationDirectXrefORM, + Mapping as MappingORM, + MappingJobs as MappingJobsORM, + CoordinateXref as CoordinateXrefORM, + Synonym as SynonymORM, + Pairs as PairsORM, + PrimaryXref as PrimaryXrefORM, + DisplayXrefPriority as DisplayXrefPriorityORM, + GeneDescPriority as GeneDescPriorityORM, +) + +from datetime import datetime + + +class BasicMapper: + def __init__(self, args: Dict[str, Any] = None) -> None: + if args is None: + args = {} + + self._xref = args.get("xref") + self._core = args.get("core") + self._dna_file = args.get("dna_file") + self._protein_file = args.get("protein_file") + self._log_file = args.get("log_file") + self._species_dir = args.get("species_dir") + + def xref(self, xref_db_engine: Engine = None) -> Engine: + """Getter/Setter for the xref DB engine. + + Parameters + ---------- + xref_db_engine: sqlalchemy.engine.Engine, optional + The xref DB engine + + Returns + ------- + The xref DB engine. + """ + if xref_db_engine: + self._xref = xref_db_engine + + return self._xref + + def core(self, core_db_engine: Engine = None) -> Engine: + """Getter/Setter for the core DB engine. + + Parameters + ---------- + core_db_engine: sqlalchemy.engine.Engine, optional + The core DB engine + + Returns + ------- + The core DB engine. + """ + if core_db_engine: + self._core = core_db_engine + + return self._core + + def dna_file(self, dna_file: str = None) -> str: + """Getter/Setter for the dna file. + + Parameters + ---------- + dna_file: str, optional + The path to the dna file + + Returns + ------- + The dna file path + """ + if dna_file: + self._dna_file = dna_file + + return self._dna_file + + def protein_file(self, protein_file: str = None) -> str: + """Getter/Setter for the protein file. 
+ + Parameters + ---------- + protein_file: str, optional + The path to the protein file + + Returns + ------- + The protein file path + """ + if protein_file: + self._protein_file = protein_file + + return self._protein_file + + def log_file(self, log_file: str = None) -> str: + """Getter/Setter for the log file. + + Parameters + ---------- + log_file: str, optional + The path to the log file + + Returns + ------- + The log file path + """ + if log_file: + self._log_file = log_file + + return self._log_file + + def species_dir(self, species_dir: str = None) -> str: + """Getter/Setter for the species directory. + + Parameters + ---------- + species_dir: str, optional + The path to the species directory + + Returns + ------- + The species directory + """ + if species_dir: + self._species_dir = species_dir + + return self._species_dir + + def official_name(self) -> None: + return None + + def add_meta_pair(self, meta_key: str, meta_value: str) -> None: + """Adds a row to the meta table. + + Parameters + ---------- + meta_key: str + The value of the 'meta_key' column in the meta table + meta_value: str + The value of the 'meta_value' column in the meta table + """ + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + with self.xref().connect() as dbi: + dbi.execute( + insert(MetaUORM).values( + meta_key=meta_key, meta_value=meta_value, date=now + ) + ) + + def get_meta_value(self, meta_key: str) -> str: + """Gets a value from the meta table based on key. + + Parameters + ---------- + meta_key: str + The value of the 'meta_key' column in the meta table + """ + with self.xref().connect() as dbi: + query = ( + select(MetaUORM.meta_value) + .where(MetaUORM.meta_key == meta_key) + .order_by(MetaUORM.meta_id.desc()) + ) + value = dbi.execute(query).first() + + if value: + value = value[0] + return value + + def update_process_status(self, status: str) -> None: + """Adds a row to the process_status table. + + Parameters + ---------- + status: str + The value of the 'status' column on the process_status table + """ + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + with self.xref().connect() as dbi: + dbi.execute(insert(ProcessStatusORM).values(status=status, date=now)) + + def set_up_logging(self) -> None: + log_file = self.log_file() + + console_handler = logging.StreamHandler() + file_handler = logging.FileHandler(log_file, mode="a") + console_handler.setLevel(logging.WARNING) + file_handler.setLevel(logging.DEBUG) + + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s | %(levelname)s | %(message)s", + datefmt="%d-%b-%Y %H:%M:%S", + handlers=[console_handler, file_handler], + ) + + def log_progress(self, message: str) -> None: + logging.info(message) + + def get_object_xref_id(self, ensembl_id: int, xref_id: int, ensembl_type: str, linkage_type: str, dbi: Connection, master_xref_id: int = None, status: str = None) -> int: + """Retrieves the object_xref row ID from ensembl ID, xref ID, ensembl type, and linkage type. 
+ + Parameters + ---------- + ensembl_id: int + The ensEMBL feature internal ID + xref_id: int + The xref ID related to the object xref + ensembl_type: str + The feature type (gene, transcript, or translation) + linkage_type: str + The type of link between the xref and ensEMBL feature + master_xref_id: int, optional + The xref ID of the xref that this object xref is dependent on + status: str, optional + The object xref status + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + The object xref ID, if found (else None). + """ + object_xref_id = None + + query = select(ObjectXrefUORM.object_xref_id).where( + ObjectXrefUORM.ensembl_id == ensembl_id, + ObjectXrefUORM.xref_id == xref_id, + ObjectXrefUORM.ensembl_object_type == ensembl_type, + ObjectXrefUORM.linkage_type == linkage_type, + ) + if master_xref_id is not None: + query = query.where(ObjectXrefUORM.master_xref_id == master_xref_id) + if status is not None: + query = query.where(ObjectXrefUORM.ox_status == status) + + result = dbi.execute(query).fetchall() + + if result: + object_xref_id = result[0][0] + + return object_xref_id + + def add_object_xref(self, ensembl_id: int, xref_id: int, ensembl_type: str, linkage_type: str, dbi: Connection, master_xref_id: int = None, status: str = None) -> int: + """Adds data into object xref table in a database. + + Parameters + ---------- + ensembl_id: int + The ensEMBL feature internal ID + xref_id: int + The xref ID related to the object xref + ensembl_type: str + The feature type (gene, transcript, or translation) + linkage_type: str + The type of link between the xref and ensEMBL feature + master_xref_id: int, optional + The xref ID of the xref that this object xref is dependent on + status: str, optional + The object xref status + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + The inserted object xref ID. + """ + query = insert(ObjectXrefUORM).values( + ensembl_id=ensembl_id, + xref_id=xref_id, + ensembl_object_type=ensembl_type, + linkage_type=linkage_type, + ) + if master_xref_id is not None: + query = query.values(master_xref_id=master_xref_id) + if status is not None: + query = query.values(ox_status=status) + dbi.execute(query) + + object_xref_id = self.get_object_xref_id( + ensembl_id, xref_id, ensembl_type, linkage_type, dbi, master_xref_id, status + ) + return object_xref_id + + def biomart_fix(self, db_name: str, type1: str, type2: str, dbi: Connection) -> None: + logging.info( + f"{db_name} is associated with both {type1} and {type2} object types. Fixing." 
+ ) + + # Figure out where to move xref to + to_type, from_type, to_id, from_id = None, None, None, None + if type1 == "Gene" or type2 == "Gene": + to_type = "Gene" + + if type1 == "Translation" or type2 == "Translation": + from_type = "Translation" + else: + from_type = "Transcript" + else: + to_type = "Transcript" + from_type = "Translation" + + logging.info(f"Moving all associations from {from_type} to {to_type}") + + to_id = getattr(GeneTranscriptTranslationORM, to_type.lower() + "_id") + from_id = getattr(GeneTranscriptTranslationORM, from_type.lower() + "_id") + + # Move the object xref + query = ( + update(ObjectXrefUORM) + .values(ensembl_object_type=to_type, ensembl_id=to_id) + .where( + ObjectXrefUORM.ensembl_object_type == from_type, + ObjectXrefUORM.ensembl_id == from_id, + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.ox_status == "DUMP_OUT", + SourceUORM.name == db_name, + ) + .prefix_with("IGNORE") + ) + dbi.execute(query) + + # Delete moved object xref + query = ( + select(ObjectXrefUORM.object_xref_id) + .outerjoin( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + .where( + ObjectXrefUORM.ensembl_object_type == from_type, + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.ox_status == "DUMP_OUT", + SourceUORM.name == db_name, + ) + ) + for row in dbi.execute(query).mappings().all(): + dbi.execute( + delete(ObjectXrefUORM).where( + ObjectXrefUORM.object_xref_id == row.object_xref_id + ) + ) + dbi.execute( + delete(IdentityXrefUORM).where( + IdentityXrefUORM.object_xref_id == row.object_xref_id + ) + ) + + # Delete dependent xref + sub_query = select(ObjectXrefUORM.object_xref_id) + query = delete(DependentXrefUORM).where( + DependentXrefUORM.object_xref_id.not_in(sub_query) + ) + dbi.execute(query) + + def update_object_xref_status(self, object_xref_id: int, status: str, dbi: Connection) -> None: + query = ( + update(ObjectXrefUORM) + .where(ObjectXrefUORM.object_xref_id == object_xref_id) + .values(ox_status=status) + ) + dbi.execute(query) diff --git a/src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py b/src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py new file mode 100644 index 000000000..535bb7ad6 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py @@ -0,0 +1,111 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
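
BasicMapper above is the plumbing shared by all the mapper modules: it holds the xref and core engines, the file locations, logging set-up, and the low-level object_xref helpers. A hedged usage sketch (connection URLs, IDs, the status string and the log path are placeholders; whether writes persist depends on how the pipeline's engines handle commits):

    from sqlalchemy import create_engine
    from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper

    # Placeholder URLs; in the pipeline these come from the workflow parameters
    xref_engine = create_engine("mysql+pymysql://user:pass@host:3306/species_xref_update")
    core_engine = create_engine("mysql+pymysql://user:pass@host:3306/species_core_110_38")

    mapper = BasicMapper({
        "xref": xref_engine,
        "core": core_engine,
        "log_file": "/tmp/xref_mapper.log",  # hypothetical path
    })
    mapper.set_up_logging()
    mapper.update_process_status("mapping_started")  # illustrative status value

    # add_object_xref() only inserts, so callers check for an existing row first,
    # exactly as the mapper modules below do (the IDs here are made up)
    with xref_engine.connect() as dbi:
        ox_id = mapper.get_object_xref_id(1234, 5678, "Gene", "DIRECT", dbi)
        if ox_id is None:
            ox_id = mapper.add_object_xref(1234, 5678, "Gene", "DIRECT", dbi)
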
+ +"""Mapper module for processing Checksum xref data.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class ChecksumMapper(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + self.mapper(mapper) + mapper.set_up_logging() + + def target(self) -> None: + return None + + def mapper(self, mapper: BasicMapper = None): + if mapper: + self._mapper = mapper + + return self._mapper + + def upload(self, results: List[Dict[str, Any]], species_id: int) -> None: + if not species_id: + logging.info("No species_id found, doing nothing") + return + + source_id = self.source_id() + + logging.info("Deleting records from previous possible upload runs") + with self.xref().connect() as xref_dbi: + self._delete_entries("object_xref", source_id, xref_dbi) + self._delete_entries("xref", source_id, xref_dbi) + + # Start session, in order to get inserted IDs + Session = sessionmaker(self.xref()) + with Session.begin() as session: + logging.info("Starting xref insertion") + + # Record UPIs to make sure we do not attempt to insert duplicate UPIs + upi_xref_id = {} + for row in results: + upi = row["upi"] + if upi_xref_id.get(upi): + row["xref_id"] = upi_xref_id[upi] + else: + xref_object = XrefUORM( + source_id=source_id, + accession=upi, + label=upi, + version=1, + species_id=species_id, + info_type="CHECKSUM", + ) + session.add(xref_object) + session.flush() + row["xref_id"] = xref_object.xref_id + upi_xref_id[upi] = xref_object.xref_id + + logging.info("Starting object_xref insertion") + for row in results: + object_xref_object = ObjectXrefUORM( + ensembl_id=row["id"], + ensembl_object_type=row["object_type"], + xref_id=row["xref_id"], + linkage_type="CHECKSUM", + ox_status="DUMP_OUT", + ) + session.add(object_xref_object) + + logging.info("Finished insertions") + + def source_id(self) -> int: + source_name = self.external_db_name() + + with self.xref().connect() as dbi: + source_id = dbi.execute( + select(SourceUORM.source_id).where(SourceUORM.name == source_name) + ).scalar() + + return int(source_id) + + def _delete_entries(self, table: str, source_id: int, dbi: Connection) -> None: + if table == "xref": + query = delete(XrefUORM).where(XrefUORM.source_id == source_id) + elif table == "object_xref": + query = delete(ObjectXrefUORM).where( + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + XrefUORM.source_id == source_id, + ) + else: + raise AttributeError( + f"Invalid table to delete: {table}. Can either be 'xref' or 'object_xref'." + ) + + count = dbi.execute(query).rowcount + + logging.info(f"Deleted {count} entries from '{table}' table") diff --git a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py new file mode 100644 index 000000000..d938d966c --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py @@ -0,0 +1,130 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for processing coordinate xref data.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * +from ensembl.common.RangeRegistry import RangeRegistry + +coding_weight = 2 +ens_weight = 3 +transcript_score_threshold = 0.75 + + +class CoordinateMapper(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + self.species_dir(mapper.species_dir()) + mapper.set_up_logging() + + def run_coordinatemapping(self, species_name: str, species_id: int, scripts_dir: str) -> None: + self.update_process_status("coordinate_xrefs_started") + + # We only do coordinate mapping for mouse and human for now + if species_name != "mus_musculus" and species_name != "homo_sapiens": + self.update_process_status("coordinate_xref_finished") + return + + output_dir = self.species_dir() + xref_filename = os.path.join(output_dir, "xref_coord.txt") + object_xref_filename = os.path.join(output_dir, "object_xref_coord.txt") + unmapped_reason_filename = os.path.join(output_dir, "unmapped_reason_coord.txt") + unmapped_object_filename = os.path.join(output_dir, "unmapped_object_coord.txt") + + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + # Figure out the last used IDs in the core DB + xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() + object_xref_id = core_dbi.execute( + select(func.max(ObjectXrefCORM.object_xref_id)) + ).scalar() + unmapped_object_id = core_dbi.execute( + select(func.max(UnmappedObjectORM.unmapped_object_id)) + ).scalar() + unmapped_reason_id = core_dbi.execute( + select(func.max(UnmappedReasonORM.unmapped_reason_id)) + ).scalar() + + logging.info( + f"Last used xref_id={xref_id}, object_xref_id={object_xref_id}, unmapped_object_id={unmapped_object_id}, unmapped_reason_id={unmapped_reason_id}" + ) + + # Get an analysis ID + analysis_params = f"weights(coding,ensembl)={coding_weight:.2f},{ens_weight:.2f};transcript_score_threshold={transcript_score_threshold:.2f}" + analysis_id = core_dbi.execute( + select(AnalysisORM.analysis_id).where( + AnalysisORM.logic_name == "xrefcoordinatemapping", + AnalysisORM.parameters == analysis_params, + ) + ).scalar() + + if not analysis_id: + analysis_id = core_dbi.execute( + select(AnalysisORM.analysis_id).where( + AnalysisORM.logic_name == "xrefcoordinatemapping" + ) + ).scalar() + + if analysis_id: + logging.info("Will update 'analysis' table with new parameter settings") + + # Update an existing analysis + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + core_dbi.execute( + update(AnalysisORM) + .where(AnalysisORM.analysis_id == analysis_id) + .values(created=now, parameters=analysis_params) + ) + else: + logging.info( + f"Cannot find analysis ID for this analysis: logic_name = 'xrefcoordinatemapping' parameters = {analysis_params}" + ) + + # Store a new analysis + logging.info("A new analysis will be added") + + analysis_id = core_dbi.execute( + select(func.max(AnalysisORM.analysis_id)) + ).scalar() + logging.info(f"Last used analysis_id is {analysis_id}") + + analysis_id += 1 + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + core_dbi.execute( + insert(AnalysisORM).values( + analysis_id=analysis_id, + created=now, + logic_name="xrefcoordinatemapping", + program="CoordinateMapper.pm", + parameters=analysis_params, + module="CoordinateMapper.pm", + ) + ) + + if analysis_id: + logging.info(f"Analysis ID is {analysis_id}") + 
+ logging.info(f"Running perl script {scripts_dir}/coordinmate_mapper.pl") + perl_cmd = f"perl {scripts_dir}/coordinmate_mapper.pl --xref_db_url '{self.xref()}' --core_db_url '{self.core()}' --species_id {species_id} --output_dir '{output_dir}' --analysis_id {analysis_id}" + cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE) + + self.update_process_status("coordinate_xref_finished") + + self.biomart_fix("UCSC", "Translation", "Gene", xref_dbi) + self.biomart_fix("UCSC", "Transcript", "Gene", xref_dbi) + + xref_dbi.close() + core_dbi.close() diff --git a/src/python/ensembl/production/xrefs/mappers/CoreInfo.py b/src/python/ensembl/production/xrefs/mappers/CoreInfo.py new file mode 100644 index 000000000..eff41f4a2 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/CoreInfo.py @@ -0,0 +1,320 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for loading core data into an xref database.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class CoreInfo(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def get_core_data(self) -> None: + # Load table gene_transcript_translation + self.load_gene_transcript_translation() + + # Load tables xxx_stable_id + self.load_stable_ids() + + self.update_process_status("core_data_loaded") + + def load_gene_transcript_translation(self) -> None: + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + query = select( + TranscriptORM.gene_id, + TranscriptORM.transcript_id, + TranslationORM.translation_id, + ).outerjoin( + TranslationORM, TranscriptORM.transcript_id == TranslationORM.transcript_id + ) + for row in core_dbi.execute(query).mappings().all(): + xref_dbi.execute( + insert(GeneTranscriptTranslationORM) + .values( + gene_id=row.gene_id, + transcript_id=row.transcript_id, + translation_id=row.translation_id, + ) + .prefix_with("IGNORE") + ) + + xref_dbi.close() + core_dbi.close() + + def load_stable_ids(self) -> None: + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + core_tables = { + "gene": GeneORM, + "transcript": TranscriptORM, + "translation": TranslationORM, + } + xref_tables = { + "gene": GeneStableIdORM, + "transcript": TranscriptStableIdORM, + "translation": TranslationStableIdORM, + } + + for table in ["gene", "transcript", "translation"]: + column = getattr(core_tables[table], f"{table}_id") + core_query = select( + column.label("internal_id"), core_tables[table].stable_id + ) + if table == "transcript": + core_query = core_query.add_columns(TranscriptORM.biotype) + + count = 0 + for row in core_dbi.execute(core_query).mappings().all(): + xref_query = ( + insert(xref_tables[table]) + .values(internal_id=row.internal_id, stable_id=row.stable_id) + .prefix_with("IGNORE") + ) + if table == "transcript": + xref_query = 
xref_query.values(biotype=row.biotype) + xref_dbi.execute(xref_query) + + count += 1 + + logging.info(f"{count} {table}s loaded from core DB") + + xref_dbi.close() + core_dbi.close() + + def get_alt_alleles(self) -> None: + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + alt_allele_list = self.fetch_all_alt_alleles(core_dbi) + + count = len(alt_allele_list) + alt_id_to_gene_id, gene_id_to_alt_id, is_reference = {}, {}, {} + max_alt_id = 0 + + if count > 0: + xref_dbi.execute(delete(AltAlleleUORM)) + + alt_added, num_of_genes = 0, 0 + + # Iterate through all alt-allele groups, pushing unique alleles into the xref alt allele table + # Track the reference gene IDs + for group_id, group_members in alt_allele_list.items(): + ref_gene = self.rep_gene_id(group_members) + + # Representative gene not guaranteed, try to find an alternative best fit + if not ref_gene: + logging.info("Get alternative reference gene") + for gene_id in self.get_all_genes(group_members): + query = select(AttribTypeORM.code).where( + SeqRegionAttribORM.seq_region_id == GeneORM.seq_region_id, + AttribTypeORM.attrib_type_id + == SeqRegionAttribORM.attrib_type_id, + GeneORM.gene_id == gene_id, + AttribTypeORM.code == "non_ref", + ) + result = core_dbi.execute(query) + if result.rowcount > 0: + continue + else: + ref_gene = gene_id + break + + if not ref_gene: + logging.warning( + f"Tried very hard but failed to select a representative gene for alt-allele-group {group_id}" + ) + continue + + is_reference[ref_gene] = 1 + others = [] + for member in group_members: + if member[0] != ref_gene: + others.append(member[0]) + + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=group_id, gene_id=ref_gene, is_reference=1 + ) + ) + num_of_genes += 1 + alt_added += 1 + for gene_id in others: + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=group_id, gene_id=gene_id, is_reference=0 + ) + ) + num_of_genes += 1 + + if group_id > max_alt_id: + max_alt_id = group_id + + logging.info(f"{alt_added} alleles found containing {num_of_genes} genes") + else: + logging.info("No alt alleles found for this species") + + # LRGs added as alt_alleles in the XREF system but never added to core + count = 0 + old_count, new_count, lrg_count = 0, 0, 0 + + query = ( + select(ObjectXrefCORM.ensembl_id, GeneORM.gene_id) + .where( + XrefCORM.xref_id == ObjectXrefCORM.xref_id, + ExternalDbORM.external_db_id == XrefCORM.external_db_id, + ObjectXrefCORM.ensembl_object_type == "Gene", + XrefCORM.display_label == GeneORM.stable_id, + ) + .filter(ExternalDbORM.db_name.like("Ens_Hs_gene")) + ) + for row in core_dbi.execute(query).mappings().all(): + # If the core gene is already in an alt_allele set then use that alt_id for the LRG gene only + # Else use a new one and add both core and LRG + group_id = self.fetch_group_id_by_gene_id(row.gene_id, core_dbi) + if group_id: + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=group_id, gene_id=row.ensembl_id, is_reference=0 + ) + ) + old_count += 1 + else: + group_id = self.fetch_group_id_by_gene_id(row.ensembl_id, core_dbi) + if group_id: + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=group_id, + gene_id=row.ensembl_id, + is_reference=1, + ) + ) + lrg_count += 1 + logging.info(f"LRG peculiarity\t{row.gene_id}\t{row.ensembl_id}") + else: + max_alt_id += 1 + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=max_alt_id, + gene_id=row.ensembl_id, + is_reference=0, + ) + ) + xref_dbi.execute( + 
insert(AltAlleleUORM).values( + alt_allele_id=max_alt_id, + gene_id=row.gene_id, + is_reference=1, + ) + ) + new_count += 1 + count += 1 + + if count: + logging.info( + f"Added {count} alt_alleles for the LRGs. {old_count} added to previous alt_alleles and {new_count} new ones" + ) + logging.info(f"LRG problem count = {lrg_count}") + + xref_dbi.close() + core_dbi.close() + + self.update_process_status("alt_alleles_added") + + def fetch_all_alt_alleles(self, dbi: Connection) -> Dict[int, List[List[Any]]]: + group_list = {} + query = None + + if self.is_multispecies(dbi): ##### TO DO: handle multiespecies + raise NotImplementedError(f"Pipeline cannot handle multispecies DBs yet") + + query = select(AltAlleleCORM.alt_allele_group_id).distinct() + + for row in dbi.execute(query).mappings().all(): + group_members = self.fetch_members_by_group_id(row.alt_allele_group_id, dbi) + group_list[row.alt_allele_group_id] = group_members + + return group_list + + def fetch_members_by_group_id(self, group_id: int, dbi: Connection) -> List[List[Any]]: + members = [] + + query = ( + select(AltAlleleCORM.alt_allele_id, AltAlleleCORM.gene_id) + .where(AltAlleleCORM.alt_allele_group_id == group_id) + .order_by(AltAlleleCORM.alt_allele_id) + ) + for row in dbi.execute(query).mappings().all(): + # Fetch alt_allele attributes + attrib_list = {} + query = select(AltAlleleAttribORM.columns.attrib).where( + AltAlleleAttribORM.columns.alt_allele_id == row.alt_allele_id + ) + for attrib_row in dbi.execute(query).mappings().all(): + attrib_list[attrib_row.attrib] = 1 + + members.append([row.gene_id, attrib_list]) + + return members + + def fetch_group_id_by_gene_id(self, gene_id: int, dbi: Connection) -> Optional[int]: + query = ( + select(AltAlleleCORM.alt_allele_group_id) + .where(AltAlleleCORM.gene_id == gene_id) + .order_by(AltAlleleCORM.alt_allele_group_id) + ) + group_list = dbi.execute(query).mappings().all() + + if len(group_list) > 0: + return group_list[0].alt_allele_group_id + + return None + + def is_multispecies(self, dbi: Connection) -> bool: + result = dbi.execute( + select(MetaCORM.meta_value).where( + MetaCORM.meta_key == "species.taxonomy_id" + ) + ) + + if result.rowcount > 1: + return True + else: + return False + + def rep_gene_id(self, group: List[List[Any]]) -> Optional[int]: + for allele in group: + gene_id = allele[0] + allele_type = allele[1] + + if allele_type.get("IS_REPRESENTATIVE"): + return gene_id + + logging.warning( + "No representative allele currently set for this AltAlleleGroup" + ) + return None + + def get_all_genes(self, group: List[List[Any]]) -> List[int]: + gene_ids = [] + + for allele in group: + gene_ids.append(allele[0]) + + return sorted(gene_ids) diff --git a/src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py b/src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py new file mode 100644 index 000000000..c3113dee3 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py @@ -0,0 +1,182 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
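
For readers following the alt-allele handling in CoreInfo above: fetch_members_by_group_id() returns each group member as a [gene_id, attrib_dict] pair, and rep_gene_id()/get_all_genes() simply walk that structure. A tiny self-contained illustration with made-up gene ids (IS_REPRESENTATIVE is the attrib code the real query relies on; the other attrib shown is just an example):

    # Shape produced by fetch_members_by_group_id(): [gene_id, {attrib_code: 1, ...}]
    group_members = [
        [1001, {"IS_REPRESENTATIVE": 1}],
        [1002, {}],
        [1003, {"AUTOMATICALLY_ASSIGNED": 1}],
    ]

    def rep_gene_id(group):
        # Same walk as CoreInfo.rep_gene_id(): first member flagged IS_REPRESENTATIVE wins
        for gene_id, attribs in group:
            if attribs.get("IS_REPRESENTATIVE"):
                return gene_id
        return None

    print(rep_gene_id(group_members))            # 1001
    print(sorted(g for g, _ in group_members))   # [1001, 1002, 1003], as get_all_genes() returns
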
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for processing direct xref data.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class DirectXrefsMapper(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def process(self) -> None: + xref_dbi = self.xref().connect() + + db_tables = { + "gene": {"direct": GeneDirectXrefORM, "stable_id": GeneStableIdORM}, + "transcript": { + "direct": TranscriptDirectXrefORM, + "stable_id": TranscriptStableIdORM, + }, + "translation": { + "direct": TranslationDirectXrefORM, + "stable_id": TranslationStableIdORM, + }, + } + + err_count = {} + object_xref_id = 0 + + for table in ["gene", "transcript", "translation"]: + direct_table = db_tables[table]["direct"] + stable_id_table = db_tables[table]["stable_id"] + + count, duplicate_direct_count, duplicate_dependent_count = 0, 0, 0 + + # Get the direct xrefs + stable_id_query = ( + select( + SourceUORM.name, + direct_table.general_xref_id, + stable_id_table.internal_id, + direct_table.ensembl_stable_id, + ) + .outerjoin( + stable_id_table, + stable_id_table.stable_id == direct_table.ensembl_stable_id, + ) + .where( + XrefUORM.xref_id == direct_table.general_xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ) + ) + for row in xref_dbi.execute(stable_id_query).mappings().all(): + dbname = row.name + xref_id = row.general_xref_id + internal_id = row.internal_id + stable_id = row.ensembl_stable_id + + # Check if internal id exists. 
If not, it is an internal id already or stable_id no longer exists + if internal_id is None: + if re.search(r"^\d+$", stable_id): + internal_id = stable_id + else: + err_count[dbname] = err_count.get(dbname, 0) + 1 + continue + + object_xref_id += 1 + count += 1 + master_xref_ids = [] + + if internal_id == 0: + raise LookupError( + f"Problem: could not find stable id {stable_id} and got past the first check for {dbname}" + ) + + # Insert into object xref table + object_xref_id = self.get_object_xref_id( + internal_id, xref_id, table, "DIRECT", xref_dbi + ) + if object_xref_id: + duplicate_direct_count += 1 + continue + else: + object_xref_id = self.add_object_xref( + internal_id, xref_id, table, "DIRECT", xref_dbi + ) + + # Insert into identity xref table + xref_dbi.execute( + insert(IdentityXrefUORM).values( + object_xref_id=object_xref_id, + query_identity=100, + target_identity=100, + ) + ) + master_xref_ids.append(xref_id) + + duplicate_dependent_count += self.process_dependents( + { + "master_xrefs": master_xref_ids, + "dup_count": duplicate_dependent_count, + "table": table, + "internal_id": internal_id, + }, + xref_dbi, + ) + + if duplicate_direct_count or duplicate_dependent_count: + logging.info( + f"Duplicate entries ignored for {duplicate_direct_count} direct xrefs and {duplicate_dependent_count} dependent xrefs" + ) + + for key, val in err_count.items(): + logging.warning( + f"{val} direct xrefs for database {key} could not be added as their stable_ids could not be found" + ) + + xref_dbi.close() + + self.update_process_status("direct_xrefs_parsed") + + def process_dependents(self, args: Dict[str, Any], dbi: Connection) -> int: + master_xref_ids = args["master_xrefs"] + duplicate_dep_count = args["dup_count"] + table = args["table"] + internal_id = args["internal_id"] + + for master_xref_id in master_xref_ids: + # Get all dependents related to master xref + dep_query = select(DependentXrefUORM.dependent_xref_id).where( + DependentXrefUORM.master_xref_id == master_xref_id + ) + for dep in dbi.execute(dep_query).mappings().all(): + # Add dependent object xref + dep_object_xref_id = self.get_object_xref_id( + internal_id, + dep.dependent_xref_id, + table, + "DEPENDENT", + dbi, + master_xref_id, + ) + if dep_object_xref_id: + duplicate_dep_count += 1 + continue + else: + dep_object_xref_id = self.add_object_xref( + internal_id, + dep.dependent_xref_id, + table, + "DEPENDENT", + dbi, + master_xref_id, + ) + + # Add identity xref + dbi.execute( + insert(IdentityXrefUORM).values( + object_xref_id=dep_object_xref_id, + query_identity=100, + target_identity=100, + ) + ) + + # Get the dependent dependents just in case + master_xref_ids.append(dep.dependent_xref_id) + + return duplicate_dep_count diff --git a/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py new file mode 100644 index 000000000..a2a543589 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py @@ -0,0 +1,871 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
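
A small note on process_dependents() above: it appends every dependent xref back onto master_xref_ids while iterating over that same list, so dependents-of-dependents are handled in the same pass (the real method also skips pairs whose object_xref already exists). A standalone sketch of just that traversal pattern over a made-up dependency map:

    # Hypothetical master -> dependent xref id relationships
    dependents = {10: [20, 21], 20: [30], 21: [], 30: []}

    master_xref_ids = [10]
    processed = []
    for master_xref_id in master_xref_ids:          # the list grows as we iterate
        for dep in dependents.get(master_xref_id, []):
            processed.append(dep)
            master_xref_ids.append(dep)             # pick up dependents of dependents too

    print(processed)   # [20, 21, 30]
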
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for setting display xrefs in the core DB.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class DisplayXrefs(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + self.mapper(mapper) + mapper.set_up_logging() + + def mapper(self, mapper: BasicMapper = None) -> BasicMapper: + if mapper: + self._mapper = mapper + + return self._mapper + + def build_display_xrefs(self) -> None: + logging.info("Processing display xrefs") + + mapper = self.mapper() + + # Set the display xrefs + if hasattr(mapper, "set_display_xrefs"): + mapper.set_display_xrefs() + else: + set_transcript_display_xrefs = False + if hasattr(mapper, "set_transcript_names"): + set_transcript_display_xrefs = True + self.set_display_xrefs(set_transcript_display_xrefs) + + # Set transcript names + if hasattr(mapper, "set_transcript_names"): + mapper.set_transcript_names() + else: + self.set_transcript_names() + + self.update_process_status("display_xrefs_done") + + # Set the gene descriptions + self.set_gene_descriptions() + + # Set the meta timestamp + self.set_meta_timestamp() + + self.update_process_status("gene_descriptions_done") + + def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: + logging.info("Setting Transcript and Gene display xrefs") + + # Get the xref offset used when adding the xrefs into the core DB + xref_offset = self.get_meta_value("xref_offset") + xref_offset = int(xref_offset) + logging.info(f"Using xref offset of {xref_offset}") + + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + mapper = self.mapper() + + # Reset transcript display xrefs + if set_transcript_display_xrefs: + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=None) + .where(TranslationORM.biotype != "LRG_gene") + ) + + for object_type in ["Gene", "Transcript"]: + if object_type == "Transcript" and not set_transcript_display_xrefs: + continue + precedence_list, ignore = None, None + + # Get name source priorities and ignore queries + method = f"{object_type.lower()}_display_xref_sources" + if hasattr(mapper, method): + precedence_list, ignore = getattr(mapper, method)() + else: + precedence_list, ignore = getattr(self, method)() + + # Add the priorities into the DB + priority = 0 + logging.info(f"Precedence for {object_type} display xrefs (1- best name)") + + for source_name in precedence_list: + priority += 1 + + # Get the source ID + query = ( + select(SourceUORM.source_id, SourceUORM.name) + .where(SourceUORM.name.like(source_name)) + .order_by(SourceUORM.priority) + ) + for row in xref_dbi.execute(query).mappings().all(): + xref_dbi.execute( + insert(DisplayXrefPriorityORM).values( + ensembl_object_type=object_type, + source_id=row.source_id, + priority=priority, + ) + ) + + logging.info(f"{priority} - {row.name}") + + # Execute ignore queries + self._apply_ignore(ignore, xref_dbi) + + object_seen = {} + display_xref_count = 0 + + # Build the case statements + GTTGene = aliased(GeneTranscriptTranslationORM) + GTTTranscript = aliased(GeneTranscriptTranslationORM) + 
GTTTranslation = aliased(GeneTranscriptTranslationORM) + gene_case_stmt = case( + [ + (ObjectXrefUORM.ensembl_object_type == "Gene", GTTGene.gene_id), + ( + ObjectXrefUORM.ensembl_object_type == "Transcript", + GTTTranscript.gene_id, + ), + ( + ObjectXrefUORM.ensembl_object_type == "Translation", + GTTTranslation.gene_id, + ), + ], + ).label("d_gene_id") + transcript_case_stmt = case( + [ + ( + ObjectXrefUORM.ensembl_object_type == "Gene", + GTTGene.transcript_id, + ), + ( + ObjectXrefUORM.ensembl_object_type == "Transcript", + GTTTranscript.transcript_id, + ), + ( + ObjectXrefUORM.ensembl_object_type == "Translation", + GTTTranslation.transcript_id, + ), + ], + ).label("d_transcript_id") + + # Get all relevent xrefs for this object type based on precendence sources + query = ( + select( + gene_case_stmt, + transcript_case_stmt, + DisplayXrefPriorityORM.priority, + XrefUORM.xref_id, + ) + .join( + SourceUORM, SourceUORM.source_id == DisplayXrefPriorityORM.source_id + ) + .join(XrefUORM, XrefUORM.source_id == SourceUORM.source_id) + .join(ObjectXrefUORM, ObjectXrefUORM.xref_id == XrefUORM.xref_id) + .join( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + .outerjoin(GTTGene, GTTGene.gene_id == ObjectXrefUORM.ensembl_id) + .outerjoin( + GTTTranscript, + GTTTranscript.transcript_id == ObjectXrefUORM.ensembl_id, + ) + .outerjoin( + GTTTranslation, + GTTTranslation.translation_id == ObjectXrefUORM.ensembl_id, + ) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + DisplayXrefPriorityORM.ensembl_object_type == object_type, + ) + .order_by( + "d_gene_id", + ObjectXrefUORM.ensembl_object_type, + DisplayXrefPriorityORM.priority, + desc( + IdentityXrefUORM.target_identity + + IdentityXrefUORM.query_identity + ), + ObjectXrefUORM.unused_priority.desc(), + XrefUORM.accession, + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + object_id = None + if object_type == "Gene": + object_id = row.d_gene_id + elif object_type == "Transcript": + object_id = row.d_transcript_id + + # Update the display xrefs + if not object_seen.get(object_id): + xref_id = int(row.xref_id) + if object_type == "Gene": + core_dbi.execute( + update(GeneORM) + .values(display_xref_id=xref_id + xref_offset) + .where( + GeneORM.gene_id == object_id, + GeneORM.display_xref_id == None, + ) + ) + elif object_type == "Transcript": + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=xref_id + xref_offset) + .where(TranscriptORM.transcript_id == object_id) + ) + + display_xref_count += 1 + object_seen[object_id] = 1 + + logging.info(f"Updated {display_xref_count} {object_type} display_xrefs") + + # Reset ignored object xrefs + xref_dbi.execute( + update(ObjectXrefUORM) + .values(ox_status="DUMP_OUT") + .where(ObjectXrefUORM.ox_status == "NO_DISPLAY") + ) + + # Remove synonyms not linked to display xrefs + query = ( + select(XrefCORM.xref_id) + .outerjoin(GeneORM, GeneORM.display_xref_id == XrefCORM.xref_id) + .where(GeneORM.display_xref_id == None) + ) + result = core_dbi.execute(query).fetchall() + xref_ids = [row[0] for row in result] + + core_dbi.execute( + delete(ExternalSynonymORM).where(ExternalSynonymORM.xref_id.in_(xref_ids)) + ) + + xref_dbi.close() + core_dbi.close() + + def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = [ + "VGNC", + "HGNC", + "MGI", + "RGD", + "ZFIN_ID", + "Xenbase", + "RFAM", + "miRBase", + "EntrezGene", + "Uniprot_gn", + ] + ignore_queries = {} + + # Ignore EntrezGene labels dependent on 
predicted RefSeqs + MasterXref = aliased(XrefUORM) + DependentXref = aliased(XrefUORM) + MasterSource = aliased(SourceUORM) + DependentSource = aliased(SourceUORM) + + query = select(ObjectXrefUORM.object_xref_id.distinct()).where( + ObjectXrefUORM.xref_id == DependentXrefUORM.dependent_xref_id, + ObjectXrefUORM.master_xref_id == DependentXrefUORM.master_xref_id, + DependentXrefUORM.dependent_xref_id == DependentXref.xref_id, + DependentXrefUORM.master_xref_id == MasterXref.xref_id, + MasterXref.source_id == MasterSource.source_id, + DependentXref.source_id == DependentSource.source_id, + MasterSource.name.like("Refseq%predicted"), + DependentSource.name.like("EntrezGene"), + ObjectXrefUORM.ox_status == "DUMP_OUT", + ) + ignore_queries["EntrezGene"] = query + + query = ( + select(ObjectXrefUORM.object_xref_id) + .join(XrefUORM, XrefUORM.xref_id == ObjectXrefUORM.xref_id) + .join(SourceUORM, SourceUORM.source_id == XrefUORM.source_id) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + XrefUORM.label.regexp_match("^LOC[[:digit:]]+"), + ) + ) + ignore_queries["LOC_prefix"] = query + + return sources_list, ignore_queries + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + return self.gene_display_xref_sources() + + def _apply_ignore(self, ignore_queries: Dict[str, Select], dbi: Connection) -> None: + # Set status to NO_DISPLAY for object_xrefs with a display_label that is just numeric + query = ( + update(ObjectXrefUORM) + .values(ox_status="NO_DISPLAY") + .where( + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.ox_status.like("DUMP_OUT"), + XrefUORM.label.regexp_match("^[0-9]+$"), + ) + ) + dbi.execute(query) + + # Go through ignore queries + for ignore_type, ignore_query in ignore_queries.items(): + # Set status to NO_DISPLAY for ignore results + for row in dbi.execute(ignore_query).mappings().all(): + dbi.execute( + update(ObjectXrefUORM) + .values(ox_status="NO_DISPLAY") + .where(ObjectXrefUORM.object_xref_id == row.object_xref_id) + ) + + def set_transcript_names(self) -> None: + logging.info("Assigning transcript names from gene names") + + core_dbi = self.core().connect() + + # Reset transcript display xrefs + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=None) + .where(TranscriptORM.biotype != "LRG_gene") + ) + + # Get the max xref and object_xref IDs + xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() + xref_id = int(xref_id) + object_xref_id = core_dbi.execute( + select(func.max(ObjectXrefCORM.object_xref_id)) + ).scalar() + object_xref_id = int(object_xref_id) + + # Get all genes with set display_xref_id + query = select( + GeneORM.gene_id, + ExternalDbORM.db_name, + XrefCORM.dbprimary_acc, + XrefCORM.display_label, + XrefCORM.description, + ).where( + GeneORM.display_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == ExternalDbORM.external_db_id, + ) + for row in core_dbi.execute(query).mappings().all(): + ext = 201 + + # Get the ID of transcript name external DB + external_db_id = core_dbi.execute( + select(ExternalDbORM.external_db_id).where( + ExternalDbORM.db_name.like(f"{row.db_name}_trans_name") + ) + ).scalar() + + if not external_db_id: + raise LookupError( + f"No external_db_id found for '{row.db_name}_trans_name'" + ) + + # Get transcripts related to current gene + query = ( + select(TranscriptORM.transcript_id) + .where(TranscriptORM.gene_id == row.gene_id) + .order_by(TranscriptORM.seq_region_start, 
TranscriptORM.seq_region_end) + ) + for transcript_row in core_dbi.execute(query).mappings().all(): + object_xref_id += 1 + + display_label = f"{row.display_label}-{ext}" + + # Check if xref already exists + insert_xref_id = core_dbi.execute( + select(XrefCORM.xref_id).where( + XrefCORM.external_db_id == external_db_id, + XrefCORM.display_label == display_label, + XrefCORM.info_type == "MISC", + ) + ).scalar() + + if not insert_xref_id: + xref_id += 1 + info_text = f"via gene {row.dbprimary_acc}" + + # Insert new xref + core_dbi.execute( + insert(XrefCORM) + .values( + xref_id=xref_id, + external_db_id=external_db_id, + dbprimary_acc=display_label, + display_label=display_label, + version=0, + description=row.description, + info_type="MISC", + info_text=info_text, + ) + .prefix_with("IGNORE") + ) + + insert_xref_id = xref_id + + # Insert object xref + core_dbi.execute( + insert(ObjectXrefCORM).values( + object_xref_id=object_xref_id, + ensembl_id=transcript_row.transcript_id, + ensembl_object_type="Transcript", + xref_id=insert_xref_id, + ) + ) + + # Set transcript dispay xref + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=insert_xref_id) + .where(TranscriptORM.transcript_id == transcript_row.transcript_id) + ) + + ext += 1 + + # Delete object xrefs with no matching xref + query = ( + select(ObjectXrefCORM.object_xref_id) + .outerjoin(XrefCORM, XrefCORM.xref_id == ObjectXrefCORM.xref_id) + .where(XrefCORM.xref_id == None) + ) + result = core_dbi.execute(query).fetchall() + object_xref_ids = [row[0] for row in result] + + core_dbi.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.object_xref_id.in_(object_xref_ids) + ) + ) + + core_dbi.close() + + def set_gene_descriptions(self) -> None: + logging.info("Setting gene descriptions") + + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + mapper = self.mapper() + + # Reset the gene descriptions + core_dbi.execute(update(GeneORM).values(description=None)) + + # Get external display names + name_to_external_name = {} + query = select( + ExternalDbORM.external_db_id, + ExternalDbORM.db_name, + ExternalDbORM.db_display_name, + ) + for row in core_dbi.execute(query).mappings().all(): + name_to_external_name[row.db_name] = row.db_display_name + + # Get source ID to external names mappings + if hasattr(mapper, "set_source_id_to_external_name"): + source_id_to_external_name, name_to_source_id = ( + mapper.set_source_id_to_external_name(name_to_external_name, xref_dbi) + ) + else: + source_id_to_external_name, name_to_source_id = ( + self.set_source_id_to_external_name(name_to_external_name, xref_dbi) + ) + + # Get description source priorities and ignore queries + if hasattr(mapper, "gene_description_sources"): + precedence_list = mapper.gene_description_sources() + ignore = None + else: + precedence_list, ignore = self.gene_description_sources() + + # Get description regular expressions + if hasattr(mapper, "gene_description_filter_regexps"): + reg_exps = mapper.gene_description_filter_regexps() + else: + reg_exps = self.gene_description_filter_regexps() + + # Add the description priorities into the DB + priority = 0 + logging.info("Precedence for Gene descriptions (1- best description)") + + for source_name in precedence_list: + priority += 1 + + # Get the source ID + query = select(SourceUORM.source_id, SourceUORM.name).where( + SourceUORM.name.like(source_name) + ) + for row in xref_dbi.execute(query).mappings().all(): + xref_dbi.execute( + insert(GeneDescPriorityORM) + 
.values(source_id=row.source_id, priority=priority) + .prefix_with("IGNORE") + ) + + logging.info(f"{priority} - {row.name}") + + # Execute ignore queries + self._apply_ignore(ignore, xref_dbi) + + no_source_name_in_desc = {} + if hasattr(mapper, "no_source_label_list"): + for source_name in mapper.no_source_label_list(): + source_id = name_to_source_id.get(source_name) + if source_id: + logging.info( + f"Source '{name}' will not have [Source:...] info in description" + ) + no_source_name_in_desc[source_id] = 1 + + gene_desc_updated = {} + + # Build the case statement + GTTGene = aliased(GeneTranscriptTranslationORM) + GTTTranscript = aliased(GeneTranscriptTranslationORM) + GTTTranslation = aliased(GeneTranscriptTranslationORM) + gene_case_stmt = case( + [ + (ObjectXrefUORM.ensembl_object_type == "Gene", GTTGene.gene_id), + ( + ObjectXrefUORM.ensembl_object_type == "Transcript", + GTTTranscript.gene_id, + ), + ( + ObjectXrefUORM.ensembl_object_type == "Translation", + GTTTranslation.gene_id, + ), + ], + ).label("d_gene_id") + + # Get all relevent xrefs for this object type based on precendence sources + query = ( + select( + gene_case_stmt, + XrefUORM.description, + SourceUORM.source_id, + XrefUORM.accession, + GeneDescPriorityORM.priority, + ) + .join(SourceUORM, SourceUORM.source_id == GeneDescPriorityORM.source_id) + .join(XrefUORM, XrefUORM.source_id == SourceUORM.source_id) + .join(ObjectXrefUORM, ObjectXrefUORM.xref_id == XrefUORM.xref_id) + .join( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + .outerjoin(GTTGene, GTTGene.gene_id == ObjectXrefUORM.ensembl_id) + .outerjoin( + GTTTranscript, GTTTranscript.transcript_id == ObjectXrefUORM.ensembl_id + ) + .outerjoin( + GTTTranslation, + GTTTranslation.translation_id == ObjectXrefUORM.ensembl_id, + ) + .where(ObjectXrefUORM.ox_status == "DUMP_OUT") + .order_by( + "d_gene_id", + ObjectXrefUORM.ensembl_object_type, + GeneDescPriorityORM.priority, + desc( + IdentityXrefUORM.target_identity + IdentityXrefUORM.query_identity + ), + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + if gene_desc_updated.get(row.d_gene_id): + continue + + if row.description: + # Apply regular expressions to description + filtered_description = self.filter_by_regexp(row.description, reg_exps) + if filtered_description != "": + source_name = source_id_to_external_name.get(row.source_id) + filtered_description += ( + f" [Source:{source_name};Acc:{row.accession}]" + ) + + # Update the gene description + core_dbi.execute( + update(GeneORM) + .values(description=filtered_description) + .where( + GeneORM.gene_id == row.d_gene_id, GeneORM.description == None + ) + ) + + gene_desc_updated[row.d_gene_id] = 1 + + logging.info(f"{len(gene_desc_updated.keys())} gene descriptions added") + + # Reset ignored object xrefs + xref_dbi.execute( + update(ObjectXrefUORM) + .values(ox_status="DUMP_OUT") + .where(ObjectXrefUORM.ox_status == "NO_DISPLAY") + ) + + xref_dbi.close() + core_dbi.close() + + def get_external_name_mappings(self, core_dbi: Connection, xref_dbi: Connection) -> Tuple[Dict[int, str], Dict[str, int]]: + # Get external display names + external_name_to_display_name = {} + query = select( + ExternalDbORM.external_db_id, + ExternalDbORM.db_name, + ExternalDbORM.db_display_name, + ) + for row in core_dbi.execute(query).mappings().all(): + external_name_to_display_name[row.db_name] = row.db_display_name + + # Get sources for available xrefs + source_id_to_external_name, source_name_to_source_id = {}, {} + query 
= ( + select(SourceUORM.source_id, SourceUORM.name) + .where(SourceUORM.source_id == XrefUORM.source_id) + .group_by(SourceUORM.source_id) + ) + for row in xref_dbi.execute(query).mappings().all(): + if external_name_to_display_name.get(row.name): + source_id_to_external_name[row.source_id] = external_name_to_display_name[row.name] + source_name_to_source_id[row.name] = row.source_id + elif re.search(r"notransfer$", row.name): + logging.info(f"Ignoring notransfer source '{row.name}'") + else: + raise LookupError(f"Could not find {row.name} in external_db table") + + return source_id_to_external_name, source_name_to_source_id + + def set_source_id_to_external_name(self, name_to_external_name: Dict[str, str], dbi: Connection) -> Tuple[Dict[int, str], Dict[str, int]]: + source_id_to_external_name, name_to_source_id = {}, {} + + # Get sources for available xrefs + query = ( + select(SourceUORM.source_id, SourceUORM.name) + .where(SourceUORM.source_id == XrefUORM.source_id) + .group_by(SourceUORM.source_id) + ) + for row in dbi.execute(query).mappings().all(): + if name_to_external_name.get(row.name): + source_id_to_external_name[row.source_id] = name_to_external_name[row.name] + name_to_source_id[row.name] = row.source_id + elif re.search(r"notransfer$", row.name): + logging.info(f"Ignoring notransfer source '{row.name}'") + else: + raise LookupError(f"Could not find {row.name} in external_db table") + + return source_id_to_external_name, name_to_source_id + + def gene_description_sources(self) -> Tuple[List[str], Dict[str, Select]]: + return self.gene_display_xref_sources() + + def gene_description_filter_regexps(self) -> List[str]: + regex = [ + r"[0-9A-Z]{10}RIK PROTEIN[ \.]", + r"\(?[0-9A-Z]{10}RIK PROTEIN\)?[ \.]", + r"^BA\S+\s+\(NOVEL PROTEIN\)\.?", + r"^BC\d+\_\d+\.?", + r"CDNA SEQUENCE\s?,? 
[A-Z]+\d+[ \.;]", + r"^CGI\-\d+ PROTEIN\.?\;?", + r"^CHROMOSOME\s+\d+\s+OPEN\s+READING\s+FRAME\s+\d+\.?.*", + r"CLONE MGC:\d+[ \.;]", + r"^\(CLONE REM\d+\) ORF \(FRAGMENT\)\.*", + r"\(CLONE \S+\)\s+", + r"^DJ\S+\s+\(NOVEL PROTEIN\)\.?", + r"^DKFZP[A-Z0-9]+\s+PROTEIN[\.;]?.*", + r"DNA SEGMENT, CHR.*", + r"EST [A-Z]+\d+[ \.;]", + r"EXPRESSED SEQUENCE [A-Z]+\d+[ \.;]", + r"^FKSG\d+\.?.*", + r"^FLJ\d+\s+PROTEIN.*", + r"^HSPC\d+.*", + r"^HSPC\d+\s+PROTEIN\.?.*", + r"HYPOTHETICAL PROTEIN,", + r"HYPOTHETICAL PROTEIN \S+[\.;]", + r"^\(*HYPOTHETICAL\s+.*", + r"\(*HYPOTHETICAL\s+.*", + r"^KIAA\d+\s+GENE\s+PRODUCT\.?.*", + r"^KIAA\d+\s+PROTEIN\.?.*", + r"^LOC\d+\s*(PROTEIN)?\.?", + r" MGC:\s*\d+[ \.;]", + r"MGC:\s*\d+[ \.;]", + r"^ORF.*", + r"^ORF\s*\d+\s+PROTEIN\.*", + r"^PRED\d+\s+PROTEIN.*", + r"^PRO\d+\.?.*", + r"^PRO\d+\s+PROTEIN\.?.*", + r"^PROTEIN C\d+ORF\d+\.*", + r"PROTEIN KIAA\d+[ \.].*", + r"PROTEIN \S+ HOMOLOG\.?", + r"^Putative uncharacterized protein.*", + r"R\d{5}_\d[ \.,].*", + r"RIKEN CDNA [0-9A-Z]{10}[ \.;]", + r"RIKEN CDNA [0-9A-Z]{10}[ \.]", + r".*RIKEN FULL-LENGTH ENRICHED LIBRARY.*", + r".*RIKEN FULL-LENGTH ENRICHED LIBRARY.*PRODUCT:", + r"^\s*\(\d*\)\s*[ \.]$", + r"^\s*\(\d*\)\s*[ \.]$", + r"^\s*\(?FRAGMENT\)?\.?\s*$", + r"^\s*\(FRAGMENT\)\.?\s*$", + r"\s*\(?GENE\)?\.?;?", + r"^\s*\(?GENE\)?\.?;?\s*$", + r"^\s*\(?GENE\)?\.?\s*$", + r"SIMILAR TO GENBANK ACCESSION NUMBER\s+\S+", + r"^SIMILAR TO GENE.*", + r"^SIMILAR TO HYPOTHETICAL.*", + r"^SIMILAR TO (KIAA|LOC).*", + r"SIMILAR TO (KIAA|LOC|RIKEN).*", + r"^SIMILAR TO PUTATIVE[ \.]", + r"SIMILAR TO PUTATIVE[ \.]", + r"^SIMILAR TO\s+$", + r"SIMILAR TO\s+$", + r"\s*\(?PRECURSOR\)?\.?;?", + r"^\s*\(?PROTEIN\)?\.?\s*$", + r"^\s+\(?\s*$", + r"^\s*\(\s*\)\s*$", + r"^UNKNOWN\s+.*", + r"^WUGSC:H_.*", + r"^WUGSC:.*\s+PROTEIN\.?.*", + ] + + return regex + + def filter_by_regexp(self, string: str, regular_expressions: List[str]) -> str: + for regex in regular_expressions: + string = re.sub(regex, "", string, flags=re.IGNORECASE) + + return string + + def set_meta_timestamp(self) -> None: + with self.core().connect() as dbi: + dbi.execute(delete(MetaCORM).where(MetaCORM.meta_key == "xref.timestamp")) + + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + dbi.execute( + insert(MetaCORM).values(meta_key="xref.timestamp", meta_value=now) + ) + + def set_display_xrefs_from_stable_table(self) -> None: + logging.info("Setting Transcript and Gene display xrefs using stable IDs") + + # Get the xref offset used when adding the xrefs into the core DB + xref_offset = self.get_meta_value("xref_offset") + xref_offset = int(xref_offset) + logging.info(f"Using xref offset of {xref_offset}") + + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + # Reset gene and transcript display xrefs + core_dbi.execute(update(GeneORM).values(display_xref_id=None)) + core_dbi.execute(update(TranscriptORM).values(display_xref_id=None)) + + # Remove descriptions with 'Source' field + core_dbi.execute( + update(GeneORM) + .values(description=None) + .where(GeneORM.description.like("%[Source:%]%")) + ) + + # Get external names and IDs + name_to_external_name, source_id_to_external_name = {}, {} + query = select( + ExternalDbORM.external_db_id, + ExternalDbORM.db_name, + ExternalDbORM.db_display_name, + ) + for row in core_dbi.execute(query).mappings().all(): + name_to_external_name[row.db_name] = row.db_display_name + + query = ( + select(SourceUORM.source_id, SourceUORM.name) + .where(SourceUORM.source_id == XrefUORM.source_id) + 
.group_by(SourceUORM.source_id) + ) + for row in xref_dbi.execute(query).mappings().all(): + if name_to_external_name.get(row.name): + source_id_to_external_name[row.source_id] = name_to_external_name[ + row.name + ] + + gene_count = 0 + + # Set gene names and descriptions + query = select( + GeneStableIdORM.internal_id, + GeneStableIdORM.display_xref_id, + XrefUORM.description, + XrefUORM.source_id, + XrefUORM.accession, + ).where(GeneStableIdORM.display_xref_id == XrefUORM.xref_id) + for row in xref_dbi.execute(query).mappings().all(): + xref_id = int(row.display_xref_id) + + # Set display xref ID + core_dbi.execute( + update(GeneORM) + .values(display_xref_id=(xref_id + xref_offset)) + .where(GeneORM.gene_id == row.internal_id) + ) + + # Set description + if row.description is not None and row.description != "": + description = f"{row.description} [Source:{source_id_to_external_name[row.source_id]};Acc:{row.accession}]" + core_dbi.execute( + update(GeneORM) + .values(description=description) + .where(GeneORM.gene_id == row.internal_id) + ) + + xref_dbi.execute( + update(GeneStableIdORM) + .values(desc_set=1) + .where(GeneStableIdORM.internal_id == row.internal_id) + ) + gene_count += 1 + + logging.info(f"{gene_count} gene descriptions added") + + # Set transcript names and descriptions + query = select( + TranscriptStableIdORM.internal_id, TranscriptStableIdORM.display_xref_id + ) + for row in xref_dbi.execute(query).mappings().all(): + xref_id = int(row.display_xref_id) + + if xref_id: + # Set display xref ID + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=(xref_id + xref_offset)) + .where(TranscriptORM.transcript_id == row.internal_id) + ) + + # Clean up synonyms linked to xrefs which are not display xrefs + query = ( + select(ExternalSynonymORM) + .outerjoin(GeneORM, GeneORM.display_xref_id == XrefCORM.xref_id) + .where( + ExternalSynonymORM.xref_id == XrefCORM.xref_id, + GeneORM.display_xref_id == None, + ) + ) + for row in core_dbi.execute(query).mappings().all(): + core_dbi.execute( + delete(ExternalSynonymORM).where( + ExternalSynonymORM.xref_id == row.xref_id, + ExternalSynonymORM.synonym == row.synonym, + ) + ) + + xref_dbi.close() + core_dbi.close() diff --git a/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py b/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py new file mode 100644 index 000000000..e4c33bf75 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py @@ -0,0 +1,637 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
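To make the DisplayXrefs description handling above concrete: a raw xref description is first passed through filter_by_regexp, and only a non-empty result is written back to the core gene with a source/accession suffix. A minimal, self-contained sketch under that reading (the regular expression, source display name and accession are illustrative values, not taken from this patch):

import re
from typing import List

def filter_by_regexp(string: str, regular_expressions: List[str]) -> str:
    # Strip boilerplate phrases from the raw description (case-insensitive)
    for regex in regular_expressions:
        string = re.sub(regex, "", string, flags=re.IGNORECASE)
    return string

reg_exps = [r"\s*\(FRAGMENT\)\.?\s*$"]  # one of the filter patterns listed above
raw_description = "Breast cancer type 2 susceptibility protein (Fragment)."
filtered = filter_by_regexp(raw_description, reg_exps)
if filtered != "":
    # Same suffix layout used when the gene description is written to the core DB
    filtered += " [Source:HGNC Symbol;Acc:HGNC:1101]"
print(filtered)
# Breast cancer type 2 susceptibility protein [Source:HGNC Symbol;Acc:HGNC:1101]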
+ +"""Mapper module for setting the feature names.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class OfficialNaming(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + self._official_name = mapper.official_name() + mapper.set_up_logging() + + def official_name(self, official_name: str = None) -> str: + if official_name: + self._official_name = official_name + + return self._official_name + + def run(self, species_id: int, verbose: bool) -> None: + logging.info("Starting official naming") + + # If no offical name then we do not want to go any further + dbname = self.official_name() + if not dbname: + self.update_process_status("official_naming_done") + return + + xref_dbi = self.xref().connect() + + # If there are any official names on transcripts or translations, move them onto gene level + if dbname == "MGI": + self.biomart_fix("MGI", "Translation", "Gene", xref_dbi) + self.biomart_fix("MGI", "Transcript", "Gene", xref_dbi) + if dbname == "ZFIN_ID": + self.biomart_fix("ZFIN_ID", "Translation", "Gene", xref_dbi) + self.biomart_fix("ZFIN_ID", "Transcript", "Gene", xref_dbi) + if dbname == "RGD": + self.biomart_fix("RGD", "Translation", "Gene", xref_dbi) + self.biomart_fix("RGD", "Transcript", "Gene", xref_dbi) + + # Get the current max values for xref and object_xref + max_xref_id = xref_dbi.execute(select(func.max(XrefUORM.xref_id))).scalar() + max_xref_id = int(max_xref_id) + max_object_xref_id = xref_dbi.execute( + select(func.max(ObjectXrefUORM.object_xref_id)) + ).scalar() + max_object_xref_id = int(max_object_xref_id) + + # Get labels, descriptions, and synonyms + display_label_to_desc = self.get_display_label_data(dbname, xref_dbi) + synonyms = self.get_synonyms(dbname, xref_dbi) + + # Get source IDs + dbname_to_source_id = self.get_dbname_to_source_id(dbname, xref_id) + + # Reset gene and transcript stable id display data + self.reset_display_xrefs(xref_dbi) + + # Get the gene and transcript stable IDs and internal IDs + gene_to_transcripts, gene_id_to_stable_id, tran_id_to_stable_id = {}, {}, {} + sorted_gene_ids = [] + + query = ( + select( + GeneTranscriptTranslationORM.gene_id, + GeneTranscriptTranslationORM.transcript_id, + GeneStableIdORM.stable_id.label("gene_stable_id"), + TranscriptStableIdORM.stable_id.label("transcript_stable_id"), + ) + .where( + GeneTranscriptTranslationORM.gene_id == GeneStableIdORM.internal_id, + GeneTranscriptTranslationORM.transcript_id + == TranscriptStableIdORM.internal_id, + ) + .order_by(GeneStableIdORM.stable_id, TranscriptStableIdORM.stable_id) + ) + for row in xref_dbi.execute(query).mappings().all(): + if not gene_to_transcripts.get(row.gene_id): + sorted_gene_ids.append(row.gene_id) + + gene_to_transcripts.setdefault(row.gene_id, []).append(row.transcript_id) + gene_id_to_stable_id[row.gene_id] = row.gene_stable_id + tran_id_to_stable_id[row.transcript_id] = row.transcript_stable_id + + # Get the object xref IDs that we should ignore (EntrezGene xref dependent on RefSeq_predicted xrefs) + ignore_object = {} + + MasterXref = aliased(XrefUORM) + DependentXref = aliased(XrefUORM) + + MasterSource = aliased(SourceUORM) + DependentSource = aliased(SourceUORM) + + query = select(ObjectXrefUORM.object_xref_id.distinct()).where( + ObjectXrefUORM.xref_id == DependentXrefUORM.dependent_xref_id, + DependentXrefUORM.dependent_xref_id == DependentXref.xref_id, + DependentXrefUORM.master_xref_id == MasterXref.xref_id, + MasterXref.source_id == 
MasterSource.source_id, + DependentXref.source_id == DependentSource.source_id, + MasterSource.name.like("Refseq%predicted"), + DependentSource.name.like("EntrezGene"), + ObjectXrefUORM.ox_status == "DUMP_OUT", + ) + for row in xref_dbi.execute(query).mappings().all(): + ignore_object[row.object_xref_id] = 1 + + xref_added, seen_gene, official_name_used = {}, {}, {} + + # Go through all genes + for gene_id in sorted_gene_ids: + transcript_source = dbname + gene_symbol, gene_symbol_xref_id, is_lrg = None, None, 0 + + # Get offical name if it has one + gene_symbol, gene_symbol_xref_id = self.get_official_domain_name( + { + "gene_id": gene_id, + "gene_id_to_stable_id": gene_id_to_stable_id, + "official_name_used": official_name_used, + "dbname": dbname, + "verbose": verbose, + }, + xref_dbi, + ) + + if gene_symbol_xref_id: + official_name_used[gene_symbol_xref_id] = 1 + + # If not found see if there is an LRG entry + if not gene_symbol: + gene_symbol, gene_symbol_xref_id, is_lrg = self.find_lrg_hgnc( + gene_id, xref_dbi + ) + + # If not found look for other valid database sources (RFAM and miRBase, EntrezGene) + if not gene_symbol: + ( + gene_symbol, + gene_symbol_xref_id, + transcript_source, + display_label_to_desc, + ) = self.find_from_other_sources( + ignore_object, + { + "gene_id": gene_id, + "display_label_to_desc": display_label_to_desc, + "transcript_source": transcript_source, + }, + xref_dbi, + ) + + if gene_symbol: + description = display_label_to_desc.get(gene_symbol) + xref_dbi.execute( + update(GeneStableIdORM) + .where(GeneStableIdORM.internal_id == gene_id) + .values(display_xref_id=gene_symbol_xref_id) + ) + + if not is_lrg: + # Set transcript names + max_xref_id, max_object_xref_id, xref_added, seen_gene = ( + self.set_transcript_display_xrefs( + { + "max_xref_id": max_xref_id, + "max_object_xref_id": max_object_xref_id, + "gene_id": gene_id, + "gene_id_to_stable_id": gene_id_to_stable_id, + "gene_symbol": gene_symbol, + "description": description, + "source_id": dbname_to_source_id.get( + f"{transcript_source}_trans_name" + ), + "xref_added": xref_added, + "seen_gene": seen_gene, + "transcript_ids": gene_to_transcripts.get(gene_id, []), + "transcript_source": transcript_source, + "species_id": species_id, + }, + xref_dbi, + ) + ) + + xref_dbi.close() + + self.update_process_status("official_naming_done") + + def get_display_label_data(self, dbname: str, dbi: Connection) -> Dict[str, str]: + label_to_desc = {} + + # Connect synonyms to xref descriptions + query = select(SynonymORM.synonym, XrefUORM.description).where( + XrefUORM.xref_id == SynonymORM.xref_id, + SourceUORM.source_id == XrefUORM.source_id, + SourceUORM.name.like(dbname), + ) + for row in dbi.execute(query).mappings().all(): + label_to_desc[row.synonym] = row.description + + # Connect display labels to xref descriptions + no_descriptions = 0 + query = select(XrefUORM.label, XrefUORM.description).where( + XrefUORM.source_id == SourceUORM.source_id, SourceUORM.name.like(dbname) + ) + for row in dbi.execute(query).mappings().all(): + if not row.description: + no_descriptions += 1 + else: + label_to_desc[row.label] = row.description + + if no_descriptions: + logging.warn(f"Descriptions not defined for {no_descriptions} labels") + + return label_to_desc + + def get_synonyms(self, dbname: str, dbi: Connection) -> Dict[str, str]: + synonyms = {} + + # Connect synonyms with xref labels + query = select(SynonymORM.synonym, XrefUORM.label).where( + XrefUORM.xref_id == SynonymORM.xref_id, + SourceUORM.source_id == 
XrefUORM.source_id, + SourceUORM.name.like(dbname), + ) + for row in dbi.execute(query).mappings().all(): + synonyms[row.synonym] = row.label + + return synonyms + + def get_dbname_to_source_id(self, dbname: str, dbi: Connection) -> Dict[str, int]: + dbname_to_source_id = {} + + sources_list = [ + "RFAM_trans_name", + "miRBase_trans_name", + "EntrezGene_trans_name", + ] + sources_list.append(f"{dbname}_trans_name") + sources_list.append(dbname) + + source_error = 0 + for source_name in sources_list: + source_id = dbi.execute( + select(SourceUORM.source_id).where(SourceUORM.name.like(source_name)) + ).scalar() + + if not source_id: + logging.warn(f"Could not find external database '{source_name}'") + source_error += 1 + else: + dbname_to_source_id[source_name] = source_id + + if source_error: + raise LookupError( + f"Could not find name for {source_error} databases. Therefore Exiting. Please add these sources" + ) + + return dbname_to_source_id + + def reset_display_xrefs(self, dbi: Connection) -> None: + dbi.execute(update(TranscriptStableIdORM).values(display_xref_id=None)) + + dbi.execute(update(GeneStableIdORM).values(display_xref_id=None, desc_set=0)) + + def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tuple[str, int]: + gene_id = args["gene_id"] + gene_id_to_stable_id = args["gene_id_to_stable_id"] + official_name_used = args["official_name_used"] + dbname = args["dbname"] + verbose = args["verbose"] + + gene_symbol, gene_symbol_xref_id = None, None + display_names, xref_id_to_display = {}, {} + best_level, name_count = 999, 0 + xref_ids_list, object_xref_ids_list = [], [] + + # Get the display labels mapped to the gene ID, and extract the ones with the highest priority + query = select( + XrefUORM.label, + XrefUORM.xref_id, + ObjectXrefUORM.object_xref_id, + SourceUORM.priority, + ).where( + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name == dbname, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ) + for row in dbi.execute(query).mappings().all(): + xref_ids_list.append(row.xref_id) + object_xref_ids_list.append(row.object_xref_id) + xref_id_to_display[row.xref_id] = row.label + + name_count += 1 + + if row.priority < best_level: + display_names.clear() + display_names[row.xref_id] = 1 + best_level = row.priority + elif row.priority == best_level: + display_names[row.xref_id] = 1 + + # Check if the best names has been found, and remove the others if so + if name_count > 1 and len(display_names) == 1: + if verbose: + logging.info( + f"For gene {gene_id_to_stable_id[gene_id]}, we have multiple {dbname} names" + ) + + gene_symbol, gene_symbol_xref_id = self.set_the_best_display_name( + display_names, + xref_ids_list, + object_xref_ids_list, + xref_id_to_display, + verbose, + dbi, + ) + if gene_symbol: + return gene_symbol, gene_symbol_xref_id + + # Perfect case, one best name found + if len(display_names) == 1: + xref_id = display_names.keys()[0] + return xref_id_to_display[xref_id], xref_id + + # Try to find the best names out of multiple ones + if len(display_names) > 1: + temp_best_identity = 0 + best_ids, best_list = [], [] + + # Fail xrefs with worse % identity if we can (query or target identity whichever is greater) + case_stmt = case( + [ + ( + IdentityXrefUORM.query_identity + >= IdentityXrefUORM.target_identity, + IdentityXrefUORM.query_identity, + ) + ], + else_=IdentityXrefUORM.target_identity, + 
).label("best_identity") + query = ( + select(XrefUORM.xref_id, case_stmt) + .where( + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id, + SourceUORM.name == dbname, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ) + .order_by(desc("best_identity")) + ) + for row in dbi.execute(query).mappings().all(): + if row.best_identity > temp_best_identity: + best_ids.clear() + best_ids[row.xref_id] = 1 + temp_best_identity = row.best_identity + elif row.best_identity == temp_best_identity: + best_ids[row.xref_id] = 1 + else: + break + + for xref_id in display_names.keys(): + best_list[xref_id_to_display[xref_id]] = 1 + + # Check if we were able to reduce the number of xrefs based on % identity + if len(best_ids) > 0 and len(best_ids) < len(display_names): + display_names = best_ids + if verbose: + logging.info( + f"For gene {gene_id_to_stable_id[gene_id]}, we have multiple {dbname} names" + ) + + gene_symbol, gene_symbol_xref_id = self.set_the_best_display_name( + display_names, + xref_ids_list, + object_xref_ids_list, + xref_id_to_display, + verbose, + dbi, + ) + if gene_symbol and len(display_names) == 1: + return gene_symbol, gene_symbol_xref_id + + # Take the name which hasn't been already assigned to another gene, if possible + xref_not_used = None + for xref_id in display_names.keys(): + if not official_name_used.get(xref_id): + xref_not_used = xref_id + + if xref_not_used: + if verbose: + logging.info(f"For gene {gene_id_to_stable_id[gene_id]}:") + for xref_id in display_names.keys(): + if xref_id == xref_not_used: + if verbose: + logging.info(f"\t{xref_id_to_display[xref_id]} chosen") + gene_symbol = xref_id_to_display[xref_id] + gene_symbol_xref_id = xref_id + else: + if verbose: + logging.info( + f"\t{xref_id_to_display[xref_id]} (left as {dbname} reference but not gene symbol)" + ) + else: + index = 0 + for xref_id in display_names.keys(): + if not index: + if verbose: + logging.info( + f"\t{xref_id_to_display[xref_id]} chosen as first" + ) + gene_symbol = xref_id_to_display[xref_id] + gene_symbol_xref_id = xref_id + else: + if verbose: + logging.info( + f"\t{xref_id_to_display[xref_id]} (left as {dbname} reference but not gene symbol)" + ) + index += 1 + + return gene_symbol, gene_symbol_xref_id + + def set_the_best_display_name(self, display_names: Dict[int, int], xref_list: List[int], object_xref_list: List[int], xref_id_to_display: Dict[int, str], verbose: bool, dbi: Connection) -> Tuple[str, int]: + gene_symbol, gene_symbol_xref_id = None, None + + for xref_id in xref_list: + # Remove object xrefs that are not in the best display names list + if not display_names.get(xref_id): + if verbose: + logging.info(f"Removing {xref_id_to_display[xref_id]} from gene") + self.update_object_xref_status( + object_xref_list[xref_id], "MULTI_DELETE", dbi + ) + else: + if verbose: + logging.info(f"Keeping the best one {xref_id_to_display[xref_id]}") + gene_symbol = xref_id_to_display[xref_id] + gene_symbol_xref_id = xref_id + + return gene_symbol, gene_symbol_xref_id + + def find_lrg_hgnc(self, gene_id: int, dbi: Connection) -> Tuple[str, int, bool]: + gene_symbol, gene_symbol_xref_id = None, None + is_lrg = False + + # Look for LRG_HGNC_notransfer, if found then find HGNC equiv and set to this + query = select( + XrefUORM.label, + XrefUORM.xref_id, + ObjectXrefUORM.object_xref_id, + SourceUORM.priority, + 
).where( + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name == "LRG_HGNC_notransfer", + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ) + for row in dbi.execute(query).mappings().all(): + # Set status to NO_DISPLAY as we do not want this transferred, just the equivalent hgnc + self.update_object_xref_status(row.object_xref_id, "NO_DISPLAY") + + new_xref_id, priority = None, None + query = ( + select(XrefUORM.xref_id, SourceUORM.priority) + .where( + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + XrefUORM.label == row.label, + SourceUORM.name == "HGNC", + ObjectXrefUORM.ox_status == "DUMP_OUT", + ) + .order_by(SourceUORM.priority) + ) + result = dbi.execute(query).fetchall() + if result: + new_xref_id, priority = result[0] + + if new_xref_id: + gene_symbol = row.label + gene_symbol_xref_id = new_xref_id + is_lrg = True + + return gene_symbol, gene_symbol_xref_id, is_lrg + + def find_from_other_sources(self, ignore: Dict[int, int], args: Dict[str, Any], dbi: Connection) -> Tuple[str, int, str, Dict[str, str]]: + gene_id = args["gene_id"] + display_label_to_desc = args["display_label_to_desc"] + transcript_source = args["transcript_source"] + + gene_symbol, gene_symbol_xref_id = None, None + other_name_number, found_gene = {}, {} + + for dbname in ["miRBase", "RFAM", "EntrezGene"]: + query = select( + XrefUORM.label, + XrefUORM.xref_id, + ObjectXrefUORM.object_xref_id, + XrefUORM.description, + ).where( + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name == dbname, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ) + for row in dbi.execute(query).mappings().all(): + if found_gene.get(gene_id): + break + if re.search(r"^LOC", row.label) or re.search(r"^SSC", row.label): + continue + if ignore.get(row.object_xref_id): + continue + + gene_symbol = row.label + gene_symbol_xref_id = row.xref_id + transcript_source = dbname + display_label_to_desc[row.label] = row.description + + if other_name_number.get(gene_symbol): + other_name_number[gene_symbol] += 1 + else: + other_name_number[gene_symbol] = 1 + + if dbname != "EntrezGene": + gene_symbol = f"{gene_symbol}.{other_name_number[gene_symbol]}" + + found_gene[gene_id] = 1 + + return gene_symbol, gene_symbol_xref_id, transcript_source, display_label_to_desc + + def set_transcript_display_xrefs(self, args: Dict[str, Any], dbi: Connection) -> Tuple[int, int, Dict[str, int], Dict[str, int]]: + max_xref_id = args["max_xref_id"] + max_object_xref_id = args["max_object_xref_id"] + gene_id = args["gene_id"] + gene_id_to_stable_id = args["gene_id_to_stable_id"] + gene_symbol = args["gene_symbol"] + description = args["description"] + source_id = args["source_id"] + xref_added = args["xref_added"] + seen_gene = args["seen_gene"] + transcript_ids = args["transcript_ids"] + transcript_source = args["transcript_source"] + species_id = args["species_id"] + + # Do nothing is LRG + if re.search("LRG", gene_id_to_stable_id.get(gene_id)): + return + + ext = 201 + if seen_gene.get(gene_symbol): + ext = seen_gene[gene_symbol] + + # Go thourgh transcripts + for transcript_id in transcript_ids: + transcript_name = f"{gene_symbol}-{ext}" + + if not source_id: + raise LookupError( + f"transcript_name = {transcript_name} for transcript_id {transcript_id} but NO source_id for this entry for 
{transcript_source}???" + ) + + index = f"{transcript_name}:{source_id}" + if not xref_added.get(index): + # Add new xref for the transcript name + max_xref_id += 1 + dbi.execute( + insert(XrefUORM) + .values( + xref_id=max_xref_id, + source_id=source_id, + accession=transcript_name, + label=transcript_name, + version=0, + species_id=species_id, + info_type="MISC", + info_text="", + description=description, + ) + .prefix_with("IGNORE") + ) + + xref_added[index] = max_xref_id + + # Update the transcript display xref + dbi.execute( + update(TranscriptStableIdORM) + .where(TranscriptStableIdORM.internal_id == transcript_id) + .values(display_xref_id=xref_added[index]) + ) + + # Add a corresponding object and identity xrefs + max_object_xref_id += 1 + dbi.execute( + insert(ObjectXrefUORM).values( + object_xref_id=max_object_xref_id, + ensembl_id=transcript_id, + ensembl_object_type="Transcript", + xref_id=xref_added[index], + linkage_type="MISC", + ox_status="DUMP_OUT", + ) + ) + + dbi.execute( + insert(IdentityXrefUORM).values( + object_xref_id=max_object_xref_id, + query_identity=100, + target_identity=100, + ) + ) + + ext += 1 + + seen_gene[gene_symbol] = ext + + return max_xref_id, max_object_xref_id, xref_added, seen_gene diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessMappings.py b/src/python/ensembl/production/xrefs/mappers/ProcessMappings.py new file mode 100644 index 000000000..53832520c --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/ProcessMappings.py @@ -0,0 +1,382 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
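The transcript naming performed by set_transcript_display_xrefs above reduces to a per-symbol counter: every transcript of a gene receives the chosen gene symbol plus a numeric suffix starting at 201, and the next free suffix is remembered per symbol in seen_gene so it is not reused. A minimal sketch of just that numbering logic (the gene symbol and internal IDs are invented for illustration):

def name_transcripts(gene_symbol: str, transcript_ids: list, seen_gene: dict) -> dict:
    # Start at 201, or continue from where this symbol left off
    ext = seen_gene.get(gene_symbol, 201)
    names = {}
    for transcript_id in transcript_ids:
        names[transcript_id] = f"{gene_symbol}-{ext}"
        ext += 1
    seen_gene[gene_symbol] = ext
    return names

seen_gene = {}
print(name_transcripts("BRCA2", [1101, 1102, 1103], seen_gene))
# {1101: 'BRCA2-201', 1102: 'BRCA2-202', 1103: 'BRCA2-203'}
print(name_transcripts("BRCA2", [2204], seen_gene))
# {2204: 'BRCA2-204'}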
+ +"""Mapper module for processing sequence matched xref data.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class ProcessMappings(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def process_mappings(self) -> None: + xref_dbi = self.xref().connect() + + query_cutoff, target_cutoff = {}, {} + + # Get cutoffs per mapping job + mapping_query = select( + MappingORM.job_id, + MappingORM.percent_query_cutoff, + MappingORM.percent_target_cutoff, + ) + for mapping in xref_dbi.execute(mapping_query).mappings().all(): + query_cutoff[mapping.job_id] = mapping.percent_query_cutoff + target_cutoff[mapping.job_id] = mapping.percent_target_cutoff + + already_processed_count, processed_count, error_count, empty_count = 0, 0, 0, 0 + + # Go through mapping jobs + mapping_query = select( + MappingJobsORM.root_dir, + MappingJobsORM.map_file, + MappingJobsORM.status, + MappingJobsORM.out_file, + MappingJobsORM.err_file, + MappingJobsORM.array_number, + MappingJobsORM.job_id, + ) + for mapping_job in xref_dbi.execute(mapping_query).mappings().all(): + root_dir = mapping_job.root_dir + if root_dir is None: + root_dir = "" + + err_file = os.path.join(root_dir, mapping_job.err_file) + out_file = os.path.join(root_dir, mapping_job.out_file) + map_file = os.path.join(root_dir, mapping_job.map_file) + + update_status = None + + if mapping_job.status == "SUCCESS": + already_processed_count += 1 + else: + if os.path.exists(err_file) and os.path.getsize(err_file) > 0: + error_count += 1 + + # Display errors on STDERR + logging.warning(f"Problem {err_file} is non zero") + try: + with open(err_file) as fh: + for line in fh: + logging.warning(f"#{line}") + except: + logging.debug( + f"No error file exists {err_file}???\n Resubmit this job" + ) + + if mapping_job.status == "SUBMITTED": + update_status = "FAILED" + else: + # Process the mapping file + if os.path.exists(map_file): + count = self.process_map_file( + map_file, + query_cutoff[mapping_job.job_id], + target_cutoff[mapping_job.job_id], + mapping_job.job_id, + mapping_job.array_number, + xref_dbi, + ) + if count > 0: + processed_count += 1 + update_status = "SUCCESS" + elif count == 0: + processed_count += 1 + empty_count += 1 + update_status = "SUCCESS" + else: + error_count += 1 + update_status = "FAILED" + else: + error_count += 1 + logging.debug( + f"Could not open map file {map_file}???\n Resubmit this job" + ) + update_status = "FAILED" + + # Update mapping job status + if update_status: + xref_dbi.execute( + update(MappingJobsORM) + .where( + MappingJobsORM.job_id == mapping_job.job_id, + MappingJobsORM.array_number == mapping_job.array_number, + ) + .values(status=update_status) + ) + + logging.info( + f"Already processed = {already_processed_count}, processed = {processed_count}, errors = {error_count}, empty = {empty_count}" + ) + + xref_dbi.close() + + if not error_count: + self.update_process_status("mapping_processed") + + def process_map_file(self, map_file: str, query_cutoff: int, target_cutoff: int, job_id: int, array_number: int, dbi: Connection) -> int: + ensembl_type = "Translation" + if re.search("dna_", map_file): + ensembl_type = "Transcript" + + # Get max object xref id + object_xref_id = dbi.execute( + select(func.max(ObjectXrefUORM.object_xref_id)) + ).scalar() + if not object_xref_id: + object_xref_id = 0 + + total_lines, last_query_id = 0, 0 + best_match_found, best_identity, best_score = 0, 0, 0 + first = 1 + + 
mRNA_biotypes = { + "protein_coding": 1, + "TR_C_gene": 1, + "IG_V_gene": 1, + "nonsense_mediated_decay": 1, + "polymorphic_pseudogene": 1, + } + + try: + mh = open(map_file) + except: + logging.debug(f"Could not open map file {map_file}\n Resubmit this job") + return -1 + + for line in mh: + load_object_xref = 0 + total_lines += 1 + + ( + label, + query_id, + target_id, + identity, + query_length, + target_length, + query_start, + query_end, + target_start, + target_end, + cigar_line, + score, + ) = line.strip().split(":") + + # Fix varibale types (for integer comparisons) + identity = int(identity) + score = int(score) + query_length = int(query_length) + target_length = int(target_length) + query_start = int(query_start) + target_start = int(target_start) + + if last_query_id != query_id: + best_match_found = 0 + best_score = 0 + best_identity = 0 + else: + # Ignore mappings with worse identity or score if we already found a good mapping + if ( + identity < best_identity or score < best_score + ) and best_match_found: + continue + + if ensembl_type == "Translation": + load_object_xref = 1 + else: + # Check if source name is RefSeq_ncRNA or RefSeq_mRNA + # If yes check biotype, if ok store object xref + source_name = dbi.execute( + select(SourceUORM.name) + .join(XrefUORM, XrefUORM.source_id == SourceUORM.source_id) + .where(XrefUORM.xref_id == query_id) + ).scalar() + + if source_name and ( + re.search(r"^RefSeq_(m|nc)RNA", source_name) + or re.search(r"^miRBase", source_name) + or re.search(r"^RFAM", source_name) + ): + # Make sure mRNA xrefs are matched to protein_coding biotype only + biotype = dbi.execute( + select(TranscriptStableIdORM.biotype).where( + TranscriptStableIdORM.internal_id == target_id + ) + ).scalar() + + if re.search(r"^RefSeq_mRNA", source_name) and mRNA_biotypes.get( + biotype + ): + load_object_xref = 1 + if re.search( + r"^RefSeq_ncRNA", source_name + ) and not mRNA_biotypes.get(biotype): + load_object_xref = 1 + if ( + re.search(r"^miRBase", source_name) + or re.search(r"^RFAM", source_name) + ) and re.search("RNA", biotype): + load_object_xref = 1 + else: + load_object_xref = 1 + + last_query_id = query_id + + # Check if found a better match + if score > best_score or identity > best_identity: + best_score = score + best_identity = identity + + if not load_object_xref: + continue + else: + best_match_found = 1 + + if not score: + self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) + raise ValueError(f"No score on line. 
Possible file corruption\n{line}") + + # Calculate percentage identities + query_identity = int(100 * identity / query_length) + target_identity = int(100 * identity / target_length) + + # Only keep alignments where both sequences match cutoff + status = "DUMP_OUT" + if query_identity < query_cutoff or target_identity < target_cutoff: + status = "FAILED_CUTOFF" + + # Add object xref row + object_xref_id = self.get_object_xref_id( + target_id, query_id, ensembl_type, "SEQUENCE_MATCH", dbi, None, status + ) + if object_xref_id: + continue + else: + try: + object_xref_id = self.add_object_xref( + target_id, + query_id, + ensembl_type, + "SEQUENCE_MATCH", + dbi, + None, + status, + ) + except: + self.update_object_xref_end( + job_id, array_number, object_xref_id, dbi + ) + raise IOError(f"Problem adding object_xref row") + + if first: + self.update_object_xref_start(job_id, array_number, object_xref_id, dbi) + first = 0 + + cigar_line = re.sub(" ", "", cigar_line) + cigar_line = re.sub(r"([MDI])(\d+)", r"\2\1", cigar_line) + + # Add identity xref row + try: + identity_xref_query = insert(IdentityXrefUORM).values( + object_xref_id=object_xref_id, + query_identity=query_identity, + target_identity=target_identity, + hit_start=query_start + 1, + hit_end=query_end, + translation_start=target_start + 1, + translation_end=target_end, + cigar_line=cigar_line, + score=score, + ) + dbi.execute(identity_xref_query) + except: + self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) + raise IOError(f"Problem loading identity_xref") + + master_xref_ids = [query_id] + for master_xref_id in master_xref_ids: + # Get all dependents related to master xref + dep_query = select(DependentXrefUORM.dependent_xref_id).where( + DependentXrefUORM.master_xref_id == master_xref_id + ) + for dep in dbi.execute(dep_query).mappings().all(): + # Add dependent object xref + dep_object_xref_id = self.get_object_xref_id( + target_id, + dep.dependent_xref_id, + ensembl_type, + "DEPENDENT", + dbi, + master_xref_id, + status, + ) + if dep_object_xref_id: + continue + else: + try: + dep_object_xref_id = self.add_object_xref( + target_id, + dep.dependent_xref_id, + ensembl_type, + "DEPENDENT", + dbi, + master_xref_id, + status, + ) + except: + self.update_object_xref_end( + job_id, array_number, object_xref_id, dbi + ) + raise IOError(f"Problem adding dependent object xref row") + + # Add dependent identity xref + dbi.execute( + insert(IdentityXrefUORM).values( + object_xref_id=dep_object_xref_id, + query_identity=query_identity, + target_identity=target_identity, + ) + ) + + # Get the dependent dependents just in case + master_xref_ids.append(dep.dependent_xref_id) + + mh.close() + + self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) + return total_lines + + def update_object_xref_end(self, job_id: int, array_number: int, object_xref_id: int, dbi: Connection) -> None: + dbi.execute( + update(MappingJobsORM) + .where( + MappingJobsORM.job_id == job_id, + MappingJobsORM.array_number == array_number, + ) + .values(object_xref_end=object_xref_id) + ) + + def update_object_xref_start(self, job_id: int, array_number: int, object_xref_id: int, dbi: Connection) -> None: + dbi.execute( + update(MappingJobsORM) + .where( + MappingJobsORM.job_id == job_id, + MappingJobsORM.array_number == array_number, + ) + .values(object_xref_start=object_xref_id) + ) diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessMoves.py b/src/python/ensembl/production/xrefs/mappers/ProcessMoves.py new file mode 100644 
index 000000000..c086cab01 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/ProcessMoves.py @@ -0,0 +1,478 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for moving xref data onto appriopriate genes.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class ProcessMoves(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def biomart_testing(self, verbose: bool) -> None: + logging.info("Starting biomart testing") + + xref_dbi = self.xref().connect() + + again = 1 + while again: + again = 0 + + last_type, last_count, last_name = None, None, "DEFAULT" + + query = ( + select( + ObjectXrefUORM.ensembl_object_type, + SourceUORM.name, + func.count(ObjectXrefUORM.object_xref_id).label("count"), + ) + .where( + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + SourceUORM.source_id == XrefUORM.source_id, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ) + .group_by(SourceUORM.name, ObjectXrefUORM.ensembl_object_type) + ) + for row in xref_dbi.execute(query).mappings().all(): + if again: + break + + if last_name == row.name: + again = 1 + self.biomart_fix( + row.name, last_type, row.ensembl_object_type, xref_dbi + ) + + last_name = row.name + last_type = row.ensembl_object_type + last_count = row.count + + if self.unlinked_entries(verbose, xref_dbi): + raise ValueError("Problems found before source_defined_move") + + xref_dbi.close() + + self.update_process_status("biomart_test_finished") + + def unlinked_entries(self, verbose: bool, dbi: Connection) -> bool: + failed = False + xref_id, count = None, None + + self.update_process_status("tests_started") + + # Get count of unlinked master xrefs + count = dbi.execute( + select(func.count(DependentXrefUORM.master_xref_id)) + .outerjoin(XrefUORM, XrefUORM.xref_id == DependentXrefUORM.master_xref_id) + .where(XrefUORM.xref_id == None) + ).scalar() + + if count: + failed = True + logging.error(f"Problem with {count} master xrefs") + + if verbose: + query = ( + select(DependentXrefUORM.master_xref_id.distinct()) + .outerjoin( + XrefUORM, XrefUORM.xref_id == DependentXrefUORM.master_xref_id + ) + .where(XrefUORM.xref_id == None) + .limit(10) + ) + for row in dbi.execute(query).mappings().all(): + logging.error(f"Problem with master xref {row.master_xref_id}") + + # Get count of unlinked dependent xrefs + count = dbi.execute( + select(func.count(DependentXrefUORM.dependent_xref_id)) + .outerjoin( + XrefUORM, XrefUORM.xref_id == DependentXrefUORM.dependent_xref_id + ) + .where(XrefUORM.xref_id == None) + ).scalar() + + if count: + failed = True + logging.error(f"Problem with {count} dependent xrefs") + + if verbose: + query = ( + select(DependentXrefUORM.dependent_xref_id.distinct()) + .outerjoin( + XrefUORM, + XrefUORM.xref_id == DependentXrefUORM.dependent_xref_id, + ) + .where(XrefUORM.xref_id == None) + 
.limit(10) + ) + for row in dbi.execute(query).mappings().all(): + logging.error( + f"Problem with dependent xref {row.dependent_xref_id}" + ) + + # Get count of unlinked primary xrefs + count = dbi.execute( + select(func.count(PrimaryXrefORM.xref_id)) + .outerjoin(XrefUORM, XrefUORM.xref_id == PrimaryXrefORM.xref_id) + .where(XrefUORM.xref_id == None) + ).scalar() + + if count: + failed = True + logging.error(f"Problem with {count} primary xrefs") + + if verbose: + query = ( + select(PrimaryXrefORM.xref_id.distinct()) + .outerjoin(XrefUORM, XrefUORM.xref_id == PrimaryXrefORM.xref_id) + .where(XrefUORM.xref_id == None) + .limit(10) + ) + for row in dbi.execute(query).mappings().all(): + logging.error(f"Problem with primary xref {row.xref_id}") + + db_tables = { + "gene": {"direct": GeneDirectXrefORM, "stable_id": GeneStableIdORM}, + "transcript": { + "direct": TranscriptDirectXrefORM, + "stable_id": TranscriptStableIdORM, + }, + "translation": { + "direct": TranslationDirectXrefORM, + "stable_id": TranslationStableIdORM, + }, + } + + # Get count of unlinked direct xrefs + for object_type in ["transcript", "translation", "gene"]: + direct_table = db_tables[object_type]["direct"] + count = dbi.execute( + select(func.count(direct_table.general_xref_id)) + .outerjoin(XrefUORM, XrefUORM.xref_id == direct_table.general_xref_id) + .where(XrefUORM.xref_id == None) + ).scalar() + + if count: + failed = True + logging.error(f"Problem with {count} {object_type} direct xrefs") + + if verbose: + query = ( + select(direct_table.general_xref_id.distinct()) + .outerjoin( + XrefUORM, XrefUORM.xref_id == direct_table.general_xref_id + ) + .where(XrefUORM.xref_id == None) + .limit(10) + ) + for row in dbi.execute(query).mappings().all(): + logging.error( + f"Problem with {object_type} direct xref {row.general_xref_id}" + ) + + # Get count of unlinked synonyms + count = dbi.execute( + select(func.count(SynonymORM.xref_id)) + .outerjoin(XrefUORM, XrefUORM.xref_id == SynonymORM.xref_id) + .where(XrefUORM.xref_id == None) + ).scalar() + + if count: + failed = True + logging.error(f"Problem with {count} synonyms") + + if verbose: + query = ( + select(SynonymORM.xref_id.distinct()) + .outerjoin(XrefUORM, XrefUORM.xref_id == SynonymORM.xref_id) + .where(XrefUORM.xref_id == None) + .limit(10) + ) + for row in dbi.execute(query).mappings().all(): + logging.error(f"Problem with synonym {row.xref_id}") + + # Get count of unlinked identity object xrefs + count = dbi.execute( + select(func.count(IdentityXrefUORM.object_xref_id)) + .outerjoin( + ObjectXrefUORM, + ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id, + ) + .where(ObjectXrefUORM.object_xref_id == None) + ).scalar() + + if count: + failed = True + logging.error(f"Problem with {count} object xrefs") + + if verbose: + query = ( + select(IdentityXrefUORM.object_xref_id.distinct()) + .outerjoin( + ObjectXrefUORM, + ObjectXrefUORM.object_xref_id + == IdentityXrefUORM.object_xref_id, + ) + .where(ObjectXrefUORM.object_xref_id == None) + .limit(10) + ) + for row in dbi.execute(query).mappings().all(): + logging.error(f"Problem with object xref {row.object_xref_id}") + + # Get count of unlinked objects + for object_type in ["transcript", "translation", "gene"]: + id_column = getattr(GeneTranscriptTranslationORM, f"{object_type}_id") + stable_id_table = db_tables[object_type]["stable_id"] + + count = dbi.execute( + select(func.count(id_column)) + .outerjoin(stable_id_table, stable_id_table.internal_id == id_column) + .where(stable_id_table.internal_id == 
None, id_column != None) + ).scalar() + + if count: + failed = True + logging.error(f"Problem with {count} {object_type}_ids") + + if verbose: + query = ( + select(id_column.label("object_id").distinct()) + .outerjoin( + stable_id_table, stable_id_table.internal_id == id_column + ) + .where(stable_id_table.internal_id == None, id_column != None) + .limit(10) + ) + for row in dbi.execute(query).mappings().all(): + logging.error(f"Problem with {object_type}_id {row.object_id}") + + if not failed: + self.update_process_status("tests_finished") + else: + self.update_process_status("tests_failed") + + return failed + + def source_defined_move(self, verbose: bool) -> None: + xref_dbi = self.xref().connect() + + for source in self.get_gene_specific_list(xref_dbi): + self.biomart_fix(source, "Translation", "Gene", xref_dbi) + self.biomart_fix(source, "Transcript", "Gene", xref_dbi) + + if self.unlinked_entries(verbose, xref_dbi): + raise ValueError("Problems found after source_defined_move") + + xref_dbi.close() + + self.update_process_status("source_level_move_finished") + + def get_gene_specific_list(self, dbi: Connection) -> List[str]: + sources_list = [ + "DBASS3", + "DBASS5", + "EntrezGene", + "miRBase", + "RFAM", + "TRNASCAN_SE", + "RNAMMER", + "UniGene", + "Uniprot_gn", + "WikiGene", + "MIM_GENE", + "MIM_MORBID", + "HGNC", + "MGI", + "ZFIN_ID", + "FlyBaseName_gene", + "RGD", + "SGD_GENE", + "VGNC", + "wormbase_gseqname", + "wormbase_locus", + "Xenbase", + "GeneCards", + ] + + used_list = [] + count = None + + # Check that the sources are used in the database considered + for source in sources_list: + count = dbi.execute( + select(func.count(XrefUORM.xref_id)).where( + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name == source, + ) + ).scalar() + + if count > 0: + used_list.append(source) + + return used_list + + def process_alt_alleles(self, verbose: bool) -> None: + logging.info("Processing alt alleles") + + xref_dbi = self.xref().connect() + + alt_to_ref, ref_to_alts = self.get_alt_allele_hashes(xref_dbi) + gene_specific_list = self.get_gene_specific_list(xref_dbi) + + move_count, del_identity_xref_count, del_object_xref_count = 0, 0, 0 + + for gene_id, ref_gene in alt_to_ref.items(): + # Move the xrefs onto the reference Gene + query = ( + update(ObjectXrefUORM) + .where( + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ObjectXrefUORM.ox_status == "DUMP_OUT", + SourceUORM.name.in_(gene_specific_list), + ) + .values(ensembl_id=ref_gene) + .prefix_with("IGNORE") + ) + row_count = xref_dbi.execute(query).rowcount + move_count += row_count + + # Delete the related identity and object xrefs + query = delete(IdentityXrefUORM).where( + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ObjectXrefUORM.ox_status == "DUMP_OUT", + SourceUORM.name.in_(gene_specific_list), + ) + row_count = xref_dbi.execute(query).rowcount + del_identity_xref_count += row_count + + query = delete(ObjectXrefUORM).where( + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ObjectXrefUORM.ox_status == "DUMP_OUT", + SourceUORM.name.in_(gene_specific_list), + 
) + row_count = xref_dbi.execute(query).rowcount + del_object_xref_count += row_count + + logging.info( + f"Number of rows: moved = {move_count}, identity_xrefs deleted = {del_identity_xref_count}, object_xrefs deleted = {del_object_xref_count}" + ) + + max_object_xref_id = xref_dbi.execute( + select(func.max(ObjectXrefUORM.object_xref_id)) + ).scalar() + max_object_xref_id = int(max_object_xref_id) + + if not max_object_xref_id: + raise LookupError("Problem getting max object_xref_id") + + added_count, ignored = 0, 0 + + # Copy the xref data related to the reference gene onto the alt alleles + for ref_gene, alts in ref_to_alts.items(): + # Get object and identity xref data related to the reference gene + query = ( + select(ObjectXrefUORM, IdentityXrefUORM) + .outerjoin( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + .where( + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.ensembl_id == ref_gene, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_object_type == "Gene", + SourceUORM.name.in_(gene_specific_list), + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + for alt in alts: + max_object_xref_id += 1 + + query = insert(ObjectXrefUORM).values( + object_xref_id=max_object_xref_id, + ensembl_id=alt, + ensembl_object_type=row.ensembl_object_type, + xref_id=row.xref_id, + linkage_annotation=row.linkage_annotation, + linkage_type=row.linkage_type, + ox_status=row.ox_status, + unused_priority=row.unused_priority, + master_xref_id=row.master_xref_id, + ) + row_count = xref_dbi.execute(query).rowcount + + # Only add identity xref if object_xref was added successfully + if row_count: + added_count += 1 + + query = insert(IdentityXrefUORM).values( + object_xref_id=max_object_xref_id, + query_identity=row.query_identity, + target_identity=row.target_identity, + hit_start=row.hit_start, + hit_end=row.hit_end, + translation_start=row.translation_start, + translation_end=row.translation_end, + cigar_line=row.cigar_line, + score=row.score, + evalue=row.evalue, + ) + xref_dbi.execute(query) + else: + ignored += 1 + + logging.info(f"Added {added_count} new mappings and ignored {ignored}") + + if self.unlinked_entries(verbose, xref_dbi): + raise ValueError("Problems found after process_alt_alleles") + + xref_dbi.close() + + self.update_process_status("alt_alleles_processed") + + def get_alt_allele_hashes(self, dbi: Connection) -> Tuple[Dict[int, int], Dict[int, List[int]]]: + alt_to_ref, ref_to_alts = {}, {} + last_alt_allele, ref_gene = 0, None + + query = select( + AltAlleleUORM.alt_allele_id, + AltAlleleUORM.gene_id, + AltAlleleUORM.is_reference, + ).order_by(AltAlleleUORM.alt_allele_id, AltAlleleUORM.is_reference.desc()) + for row in dbi.execute(query).mappings().all(): + if row.alt_allele_id != last_alt_allele: + # Use the first non-reference gene if there is no reference gene in an alt_allele + ref_gene = row.gene_id + else: + alt_to_ref[row.gene_id] = ref_gene + ref_to_alts.setdefault(ref_gene, []).append(row.gene_id) + + last_alt_allele = row.alt_allele_id + + return alt_to_ref, ref_to_alts diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessPaired.py b/src/python/ensembl/production/xrefs/mappers/ProcessPaired.py new file mode 100644 index 000000000..0dcbfdff4 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/ProcessPaired.py @@ -0,0 +1,248 @@ +# See the NOTICE file distributed with this work for additional information +# regarding 
copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for processing paired xrefs.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class ProcessPaired(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def process(self) -> None: + logging.info("Processing paired xrefs") + + xref_dbi = self.xref().connect() + + object_xref_id = None + change = { + "translation object xrefs added": 0, + "translation object xrefs removed": 0, + } + RefSeq_pep_translation = {} + + # Get the transcript RefSeq_mRNA% object xrefs, and the paired RefSeq_peptide% accessions as well as the translation id for the transcript + query = ( + select( + ObjectXrefUORM.object_xref_id, + GeneTranscriptTranslationORM.translation_id, + PairsORM.source_id, + PairsORM.accession1, + IdentityXrefUORM.query_identity, + IdentityXrefUORM.target_identity, + ) + .join( + XrefUORM, + (XrefUORM.xref_id == ObjectXrefUORM.xref_id) + & (ObjectXrefUORM.ox_status == "DUMP_OUT"), + ) + .join( + SourceUORM, + (SourceUORM.source_id == XrefUORM.source_id) + & (SourceUORM.name.like("RefSeq_mRNA%")), + ) + .join(PairsORM, PairsORM.accession2 == XrefUORM.accession) + .join( + GeneTranscriptTranslationORM, + GeneTranscriptTranslationORM.transcript_id == ObjectXrefUORM.ensembl_id, + ) + .join( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + # Check if translation is linked to the paired RefSeq peptide + if row.translation_id: + query = ( + select(ObjectXrefUORM.object_xref_id, ObjectXrefUORM.xref_id) + .join(XrefUORM, XrefUORM.xref_id == ObjectXrefUORM.xref_id) + .where( + ObjectXrefUORM.ox_status.in_(["DUMP_OUT", "FAILED_PRIORITY"]), + ObjectXrefUORM.ensembl_object_type == "Translation", + ObjectXrefUORM.ensembl_id == row.translation_id, + XrefUORM.source_id == row.source_id, + XrefUORM.accession == row.accession1, + ) + ) + result = xref_dbi.execute(query) + if result.rowcount > 0: + object_xref_row = result.mappings().all()[0] + transl_object_xref_id = object_xref_row.object_xref_id + else: + transl_object_xref_id = None + + # If it's already linked we don't have to do anything + if not transl_object_xref_id: + # Get the associated xref ID + xref_id = xref_dbi.execute( + select(XrefUORM.xref_id).where( + XrefUORM.accession == row.accession1, + XrefUORM.source_id == row.source_id, + ) + ).scalar() + + if not xref_id: + raise LookupError( + f"Xref not found for accession {row.accession1} source_id {row.source_id}" + ) + + # Add a new object xref + object_xref_id = self.add_object_xref( + row.translation_id, + xref_id, + "Translation", + "INFERRED_PAIR", + xref_dbi, + None, + "DUMP_OUT", + ) + + # Update info type for xref + xref_dbi.execute( + update(XrefUORM) + .where(XrefUORM.xref_id == xref_id) + .values(info_type="INFERRED_PAIR") + ) + + # Also insert into identity_xref if needed + if 
row.query_identity and row.target_identity: + xref_dbi.execute( + insert(IdentityXrefUORM).values( + object_xref_id=object_xref_id, + query_identity=row.query_identity, + target_identity=row.target_identity, + ) + ) + + change["translation object xrefs added"] += 1 + transl_object_xref_id = object_xref_id + + if transl_object_xref_id: + RefSeq_pep_translation.setdefault(row.accession1, []).append( + row.translation_id + ) + + # Go through RefSeq_peptide% object_xrefs + query = ( + select( + ObjectXrefUORM.object_xref_id, + ObjectXrefUORM.ensembl_id, + XrefUORM.accession, + GeneTranscriptTranslationORM.transcript_id, + ) + .join( + ObjectXrefUORM, + ( + ObjectXrefUORM.ensembl_id + == GeneTranscriptTranslationORM.translation_id + ) + & (ObjectXrefUORM.ensembl_object_type == "Translation"), + ) + .join( + XrefUORM, + (XrefUORM.xref_id == ObjectXrefUORM.xref_id) + & (ObjectXrefUORM.ox_status == "DUMP_OUT") + & (ObjectXrefUORM.ensembl_object_type == "Translation"), + ) + .join( + SourceUORM, + (SourceUORM.source_id == XrefUORM.source_id) + & (SourceUORM.name.like("RefSeq_peptide%")), + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + if RefSeq_pep_translation.get(row.accession): + found = 0 + for tr_id in RefSeq_pep_translation[row.accession]: + if tr_id == row.ensembl_id: + found = 1 + + if not found: + # This translations's transcript is not matched with the paired RefSeq_mRNA%, + # change the status to 'MULTI_DELETE' + self.update_object_xref_status( + row.object_xref_id, "MULTI_DELETE", xref_dbi + ) + + # Process all dependent xrefs as well + self.process_dependents( + row.object_xref_id, row.ensembl_id, row.transcript_id, xref_dbi + ) + + change["translation object xrefs removed"] += 1 + + for key, val in change.items(): + logging.info(f"{key}:\t{val}") + + xref_dbi.close() + + self.update_process_status("processed_pairs") + + def process_dependents(self, translation_object_xref_id: int, translation_id: int, transcript_id: int, dbi: Connection) -> None: + master_object_xrefs = [] + new_master_object_xref_id = None + master_object_xref_ids = {} + + master_object_xrefs.append(translation_object_xref_id) + master_object_xref_ids[translation_object_xref_id] = 1 + + while master_object_xrefs: + master_object_xref_id = master_object_xrefs.pop() + dependent_object_xref_id = None + + MasterObjectXref = aliased(ObjectXrefUORM) + DependentObjectXref = aliased(ObjectXrefUORM) + + MasterXref = aliased(XrefUORM) + DependentXref = aliased(XrefUORM) + + query = select(DependentObjectXref.object_xref_id.distinct()).where( + DependentXref.xref_id == DependentXrefUORM.dependent_xref_id, + MasterXref.xref_id == DependentXrefUORM.master_xref_id, + DependentXref.xref_id == DependentObjectXref.xref_id, + MasterXref.xref_id == MasterObjectXref.xref_id, + MasterObjectXref.object_xref_id == master_object_xref_id, + DependentObjectXref.master_xref_id == MasterXref.xref_id, + DependentObjectXref.ensembl_id == translation_id, + DependentObjectXref.ensembl_object_type == "Translation", + DependentObjectXref.ox_status == "DUMP_OUT", + ) + for row in dbi.execute(query).mappings().all(): + self.update_object_xref_status(row.object_xref_id, "MULTI_DELETE", dbi) + + if not master_object_xref_ids.get(row.object_xref_id): + master_object_xref_ids[row.object_xref_id] = 1 + master_object_xrefs.append(row.object_xref_id) + + query = select(DependentObjectXref.object_xref_id.distinct()).where( + DependentXref.xref_id == DependentXrefUORM.dependent_xref_id, + MasterXref.xref_id == DependentXrefUORM.master_xref_id, + 
DependentXref.xref_id == DependentObjectXref.xref_id, + MasterXref.xref_id == MasterObjectXref.xref_id, + MasterObjectXref.object_xref_id == master_object_xref_id, + DependentObjectXref.master_xref_id == MasterXref.xref_id, + DependentObjectXref.ensembl_id == transcript_id, + DependentObjectXref.ensembl_object_type == "Transcript", + DependentObjectXref.ox_status == "DUMP_OUT", + ) + for row in dbi.execute(query).mappings().all(): + self.update_object_xref_status(row.object_xref_id, "MULTI_DELETE", dbi) + + if not master_object_xref_ids.get(row.object_xref_id): + master_object_xref_ids[row.object_xref_id] = 1 + master_object_xrefs.append(row.object_xref_id) diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py b/src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py new file mode 100644 index 000000000..ba212ddf6 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py @@ -0,0 +1,408 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for processing xref priorities.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class ProcessPriorities(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def process(self) -> None: + logging.info("Processing priorities") + + xref_dbi = self.xref().connect() + + names = self.get_priority_names(xref_dbi) + + for name in names: + logging.info(f"'{name}' will be processed as priority xrefs") + + # Set to failed all those that have no object xrefs + query = ( + select(XrefUORM.xref_id) + .outerjoin(ObjectXrefUORM, ObjectXrefUORM.xref_id == XrefUORM.xref_id) + .where( + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name == name, + ObjectXrefUORM.object_xref_id == None, + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + self.update_xref_dumped( + row.xref_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi + ) + + # Now ALL object_xrefs have an identity_xref + # So we can do a straight join and treat all info_types the same way + for name in names: + last_acc, last_name, best_xref_id, last_xref_id, seen = "", "", None, 0, 0 + best_ensembl_id, gone = [], [] + + query = ( + select( + ObjectXrefUORM.object_xref_id, + XrefUORM.accession, + XrefUORM.xref_id, + ( + IdentityXrefUORM.query_identity + + IdentityXrefUORM.target_identity + ).label("identity"), + ObjectXrefUORM.ox_status, + ObjectXrefUORM.ensembl_object_type, + ObjectXrefUORM.ensembl_id, + XrefUORM.info_type, + ) + .where( + ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name == name, + ) + .order_by( + XrefUORM.accession.desc(), + SourceUORM.priority, + desc("identity"), + XrefUORM.xref_id.desc(), + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + if last_acc 
== row.accession: + if row.xref_id != best_xref_id: + # We've already seen this accession before, and this xref_id is not the best one + seen = row.xref_id == last_xref_id + last_xref_id = row.xref_id + + # If xref is a sequence_match, we want to copy the alignment identity_xref to prioritised mappings of the same ensembl_id + if row.info_type == "SEQUENCE_MATCH": + identity_xref_row, object_xref_row = None, None + + query = select(IdentityXrefUORM).where( + IdentityXrefUORM.object_xref_id == row.object_xref_id + ) + result = xref_dbi.execute(query) + if result.rowcount > 0: + identity_xref_row = result.mappings().all()[0] + + query = select(ObjectXrefUORM.object_xref_id).where( + ObjectXrefUORM.xref_id == best_xref_id, + ObjectXrefUORM.ensembl_object_type + == row.ensembl_object_type, + ObjectXrefUORM.ensembl_id == row.ensembl_id, + ) + result = xref_dbi.execute(query) + if result.rowcount > 0: + object_xref_row = result.mappings().all()[0] + + if identity_xref_row and object_xref_row: + query = ( + update(IdentityXrefUORM) + .where( + IdentityXrefUORM.object_xref_id + == object_xref_row.object_xref_id + ) + .values( + query_identity=identity_xref_row.query_identity, + target_identity=identity_xref_row.target_identity, + hit_start=identity_xref_row.hit_start, + hit_end=identity_xref_row.hit_end, + translation_start=identity_xref_row.translation_start, + translation_end=identity_xref_row.translation_end, + cigar_line=identity_xref_row.cigar_line, + score=identity_xref_row.score, + evalue=identity_xref_row.evalue, + ) + ) + xref_dbi.execute(query) + + # If the xref is marked DUMP_OUT, set it to FAILED_PRIORITY + if row.ox_status == "DUMP_OUT": + xref_dbi.execute( + update(ObjectXrefUORM) + .where( + ObjectXrefUORM.object_xref_id == row.object_xref_id + ) + .values(ox_status="FAILED_PRIORITY") + ) + + # If it is the first time processing this xref_id, also process dependents and update status + if not seen: + self.update_xref_dumped( + row.xref_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi + ) + + # Copy synonyms across if they are missing + query = select(SynonymORM.synonym).where( + SynonymORM.xref_id == row.xref_id + ) + for synonym_row in ( + xref_dbi.execute(query).mappings().all() + ): + xref_dbi.execute( + insert(SynonymORM) + .values( + xref_id=best_xref_id, + synonym=synonym_row.synonym, + ) + .prefix_with("IGNORE") + ) + + self.process_dependents( + row.xref_id, best_xref_id, xref_dbi + ) + else: + # Status is not DUMP_OUT + self.update_xref_dumped( + row.xref_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi + ) + else: + # Alignment did not pass, dismiss + if row.ox_status == "FAILED_CUTOFF": + continue + + # There might be several mappings for the best priority + best_ensembl_id.append(row.ensembl_id) + + # Best priority failed so another one now found so set dumped + if len(gone) > 0: + if last_name == row.accession: + for x_id in gone: + self.update_xref_dumped( + x_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi + ) + else: + # New xref_id + if row.ox_status == "DUMP_OUT": + last_acc = row.accession + best_xref_id = row.xref_id + best_ensembl_id = [row.ensembl_id] + + if len(gone) > 0 and last_name == row.accession: + for x_id in gone: + self.update_xref_dumped( + x_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi + ) + gone = [] + else: + # New xref_id not DUMP_OUT + if last_name != row.accession: + gone = [] + + gone.append(row.xref_id) + last_name = row.accession + + xref_dbi.close() + + self.update_process_status("priorities_flagged") + + def get_priority_names(self, dbi: Connection) -> List[str]: + 
names = [] + seen = {} + last_name = "rubbish" + + query = ( + select( + SourceUORM.priority_description.label("description"), SourceUORM.name + ) + .where(SourceUORM.source_id == XrefUORM.source_id) + .group_by(SourceUORM.priority_description, SourceUORM.name) + .order_by(SourceUORM.name) + ) + for row in dbi.execute(query).mappings().all(): + if row.name == last_name and not seen.get(row.name): + names.append(row.name) + seen[row.name] = 1 + last_name = row.name + + return names + + def update_xref_dumped(self, xref_id: int, dumped: str, dbi: Connection) -> None: + dbi.execute( + update(XrefUORM).where(XrefUORM.xref_id == xref_id).values(dumped=dumped) + ) + + def process_dependents(self, old_master_xref_id: int, new_master_xref_id: int, dbi: Connection) -> None: + master_xrefs = [old_master_xref_id] + recursive = 0 + + # Create a hash of all possible mappings for this accession + ensembl_ids = {} + query = ( + select( + ObjectXrefUORM.ensembl_object_type.distinct(), ObjectXrefUORM.ensembl_id + ) + .where( + ObjectXrefUORM.ox_status != "FAILED_CUTOFF", + ObjectXrefUORM.xref_id == new_master_xref_id, + ) + .order_by(ObjectXrefUORM.ensembl_object_type) + ) + for row in dbi.execute(query).mappings().all(): + ensembl_ids.setdefault(row.ensembl_object_type, []).append(row.ensembl_id) + + old_ensembl_ids = {} + query = ( + select( + ObjectXrefUORM.ensembl_object_type.distinct(), ObjectXrefUORM.ensembl_id + ) + .where( + ObjectXrefUORM.ox_status != "FAILED_CUTOFF", + ObjectXrefUORM.xref_id == old_master_xref_id, + ) + .order_by(ObjectXrefUORM.ensembl_object_type) + ) + for row in dbi.execute(query).mappings().all(): + old_ensembl_ids.setdefault(row.ensembl_object_type, []).append( + row.ensembl_id + ) + + # Loop through all dependent xrefs of old master xref, and recurse + while master_xrefs: + xref_id = master_xrefs.pop() + + if recursive: + new_master_xref_id = xref_id + + # Get dependent xrefs, be they gene, transcript or translation + query = ( + select( + DependentXrefUORM.dependent_xref_id.distinct(), + DependentXrefUORM.linkage_annotation, + DependentXrefUORM.linkage_source_id, + ObjectXrefUORM.ensembl_object_type, + ) + .where( + ObjectXrefUORM.xref_id == DependentXrefUORM.dependent_xref_id, + ObjectXrefUORM.master_xref_id == DependentXrefUORM.master_xref_id, + DependentXrefUORM.master_xref_id == xref_id, + ) + .order_by(ObjectXrefUORM.ensembl_object_type) + ) + for row in dbi.execute(query).mappings().all(): + # Remove all mappings to low priority xrefs + # Then delete any leftover identity xrefs of it + for ensembl_id in old_ensembl_ids.get(row.ensembl_object_type): + self._detach_object_xref( + xref_id, + row.dependent_xref_id, + row.ensembl_object_type, + ensembl_id, + dbi, + ) + + # Duplicate each dependent for the new master xref if it is the first in the chain + if not recursive: + dbi.execute( + insert(DependentXrefUORM) + .values( + master_xref_id=new_master_xref_id, + dependent_xref_id=row.dependent_xref_id, + linkage_annotation=row.linkage_annotation, + linkage_source_id=row.linkage_source_id, + ) + .prefix_with("IGNORE") + ) + + # Loop through all chosen (best) ensembl ids mapped to priority xref, and connect them with object_xrefs + for ensembl_id in ensembl_ids.get(row.ensembl_object_type): + # Add new object_xref for each best_ensembl_id + dbi.execute( + insert(ObjectXrefUORM) + .values( + master_xref_id=new_master_xref_id, + ensembl_object_type=row.ensembl_object_type, + ensembl_id=ensembl_id, + linkage_type="DEPENDENT", + ox_status="DUMP_OUT", + 
xref_id=row.dependent_xref_id, + ) + .prefix_with("IGNORE") + ) + + # Get inserted ID + query = select(ObjectXrefUORM.object_xref_id).where( + ObjectXrefUORM.master_xref_id == new_master_xref_id, + ObjectXrefUORM.ensembl_object_type == row.ensembl_object_type, + ObjectXrefUORM.ensembl_id == ensembl_id, + ObjectXrefUORM.linkage_type == "DEPENDENT", + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.xref_id == row.dependent_xref_id, + ) + for object_xref_row in dbi.execute(query).mappings().all(): + dbi.execute( + insert(IdentityXrefUORM) + .values( + object_xref_id=object_xref_row.object_xref_id, + query_identity=100, + target_identity=100, + ) + .prefix_with("IGNORE") + ) + + if row.dependent_xref_id != xref_id: + master_xrefs.append(row.dependent_xref_id) + + recursive = 1 + + def _detach_object_xref(self, xref_id: int, dependent_xref_id: int, object_type: str, ensembl_id: int, dbi: Connection) -> None: + # Drop all the identity and go xrefs for the dependents of an xref + query = ( + select(ObjectXrefUORM.object_xref_id) + .outerjoin( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + .where( + ObjectXrefUORM.master_xref_id == xref_id, + ObjectXrefUORM.ensembl_object_type == object_type, + ObjectXrefUORM.xref_id == dependent_xref_id, + ObjectXrefUORM.ensembl_id == ensembl_id, + ) + ) + result = dbi.execute(query).fetchall() + object_xref_ids = [row[0] for row in result] + + dbi.execute( + delete(IdentityXrefUORM).where( + IdentityXrefUORM.object_xref_id.in_(object_xref_ids) + ) + ) + + # Change status of object_xref to FAILED_PRIORITY for record keeping + dbi.execute( + update(ObjectXrefUORM) + .where( + ObjectXrefUORM.master_xref_id == xref_id, + ObjectXrefUORM.ensembl_object_type == object_type, + ObjectXrefUORM.xref_id == dependent_xref_id, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_id == ensembl_id, + ) + .values(ox_status="FAILED_PRIORITY") + ) + + # Delete the duplicates + dbi.execute( + delete(ObjectXrefUORM).where( + ObjectXrefUORM.master_xref_id == xref_id, + ObjectXrefUORM.ensembl_object_type == object_type, + ObjectXrefUORM.xref_id == dependent_xref_id, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_id == ensembl_id, + ) + ) diff --git a/src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py b/src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py new file mode 100644 index 000000000..473af5a69 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py @@ -0,0 +1,28 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
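Before moving on to the checksum mappers, it helps to restate the selection rule that ProcessPriorities.process applies above: for each accession, candidate mappings are ordered by source priority and by combined query/target identity, the winner keeps its object_xrefs, and every other candidate is flagged (FAILED_PRIORITY on the object_xref, NO_DUMP_ANOTHER_PRIORITY on the xref) before its dependents are re-pointed at the winner. A minimal, self-contained sketch of that ranking follows; Candidate and pick_best_per_accession are illustrative names, not part of this patch.

from collections import namedtuple

Candidate = namedtuple("Candidate", "xref_id accession priority query_identity target_identity")

def pick_best_per_accession(candidates):
    # Keep, per accession, the candidate with the lowest source priority value;
    # break ties on the highest combined query + target identity.
    best = {}
    for cand in candidates:
        rank = (cand.priority, -(cand.query_identity + cand.target_identity))
        current = best.get(cand.accession)
        if current is None or rank < current[0]:
            best[cand.accession] = (rank, cand.xref_id)
    return {acc: xref_id for acc, (rank, xref_id) in best.items()}

# Priority 1 beats priority 2 for P12345; identity breaks the tie for Q99999.
picked = pick_best_per_accession([
    Candidate(10, "P12345", 2, 90, 90),
    Candidate(11, "P12345", 1, 80, 80),
    Candidate(30, "Q99999", 1, 70, 70),
    Candidate(31, "Q99999", 1, 95, 95),
])
assert picked == {"P12345": 11, "Q99999": 31}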
+ +"""Mapper module for processing RNACentral xref data.""" + +from ensembl.production.xrefs.mappers.ChecksumMapper import * + + +class RNACentralMapper(ChecksumMapper): + def target(self) -> str: + return self.mapper().dna_file() + + def external_db_name(self) -> str: + return "RNAcentral" + + def object_type(self) -> str: + return "Transcript" diff --git a/src/python/ensembl/production/xrefs/mappers/TestMappings.py b/src/python/ensembl/production/xrefs/mappers/TestMappings.py new file mode 100644 index 000000000..4511741d1 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/TestMappings.py @@ -0,0 +1,199 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for running validity checks on xref data.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class TestMappings(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def direct_stable_id_check(self) -> int: + xref_dbi = self.xref().connect() + + db_tables = { + "gene": {"direct": GeneDirectXrefORM, "stable_id": GeneStableIdORM}, + "transcript": { + "direct": TranscriptDirectXrefORM, + "stable_id": TranscriptStableIdORM, + }, + "translation": { + "direct": TranslationDirectXrefORM, + "stable_id": TranslationStableIdORM, + }, + } + + total_warnings_count = 0 + + for object_type in ["gene", "transcript", "translation"]: + warnings_count = 0 + direct_table = db_tables[object_type]["direct"] + stable_id_table = db_tables[object_type]["stable_id"] + + query = ( + select(SourceUORM.name, func.count(XrefUORM.xref_id).label("count")) + .join(XrefUORM, SourceUORM.source_id == XrefUORM.source_id) + .join(direct_table, XrefUORM.xref_id == direct_table.general_xref_id) + .outerjoin( + stable_id_table, + stable_id_table.stable_id == direct_table.ensembl_stable_id, + ) + .where(stable_id_table.stable_id == None) + .group_by(SourceUORM.name) + ) + for row in xref_dbi.execute(query).mappings().all(): + logging.warn( + f"{row.name} has {row.count} invalid stable IDs in {object_type}_direct_xref" + ) + warnings_count += 1 + + total_warnings_count += warnings_count + + xref_dbi.close() + + self.update_process_status("direct_stable_id_check_done") + + return total_warnings_count + + def xrefs_counts_check(self) -> int: + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + warnings_count = 0 + core_count, xref_count = {}, {} + + # TO DO: sqlalchemy syntax -- can't figure out how to count 2 columns + xref_query = f'SELECT s.name, COUNT(DISTINCT x.xref_id, ox.ensembl_id) AS count FROM xref x, object_xref ox, source s WHERE ox.xref_id = x.xref_id AND x.source_id = s.source_id AND ox_status = "DUMP_OUT" GROUP BY s.name' + for row in xref_dbi.execute(text(xref_query)).mappings().all(): + xref_count[row.name] = row.count + + query = ( + select( + ExternalDbORM.db_name, + 
func.count(ObjectXrefCORM.object_xref_id).label("count"), + ) + .where( + XrefCORM.xref_id == ObjectXrefCORM.xref_id, + XrefCORM.external_db_id == ExternalDbORM.external_db_id, + ) + .filter((XrefCORM.info_type == None) | (XrefCORM.info_type != "PROJECTION")) + .group_by(ExternalDbORM.db_name) + ) + for row in core_dbi.execute(query).mappings().all(): + change = 0 + core_count[row.db_name] = row.count + + if xref_count.get(row.db_name): + change = ((xref_count[row.db_name] - row.count) / row.count) * 100 + + if change > 5: + logging.warn( + f"{row.db_name} has increased by {change}%. It was {row.count} in the core DB, while it is {xref_count[row.db_name]} in the xref DB" + ) + warnings_count += 1 + elif change < -5: + logging.warn( + f"{row.db_name} has decreased by {change}%. It was {row.count} in the core DB, while it is {xref_count[row.db_name]} in the xref DB" + ) + warnings_count += 1 + else: + logging.warn( + f"{row.db_name} xrefs are not in the xref DB but {row.count} are in the core DB" + ) + warnings_count += 1 + + for name, count in xref_count.items(): + if not core_count.get(name): + logging.warn( + f"{name} has {count} xrefs in the xref DB but none in the core DB" + ) + warnings_count += 1 + + xref_dbi.close() + core_dbi.close() + + self.update_process_status("xrefs_counts_check_done") + + return warnings_count + + def name_change_check(self, official_name: str = None) -> int: + if not official_name: + return 0 + + new_name, id_to_stable_id, alias = {}, {}, {} + warnings_count, total_count = 0, 0 + + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + query = select( + XrefUORM.label, GeneStableIdORM.internal_id, GeneStableIdORM.stable_id + ).where( + XrefUORM.xref_id == ObjectXrefUORM.object_xref_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + GeneStableIdORM.internal_id == ObjectXrefUORM.ensembl_id, + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name.like(f"{official_name}_%"), + ) + for row in xref_dbi.execute(query).mappings().all(): + new_name[row.internal_id] = row.label + id_to_stable_id[row.internal_id] = row.stable_id + + query = ( + select(XrefUORM.label, SynonymORM.synonym) + .where( + XrefUORM.xref_id == SynonymORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ) + .filter( + (SourceUORM.name.like(f"{official_name}_%")) + | (SourceUORM.name.like("EntrezGene")) + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + alias[row.synonym] = row.label + + query = select(XrefCORM.display_label, GeneORM.gene_id).where( + XrefCORM.xref_id == GeneORM.display_xref_id, + GeneORM.biotype == "protein_coding", + ) + for row in core_dbi.execute(query).mappings().all(): + if new_name.get(row.gene_id): + total_count += 1 + + if new_name.get(row.gene_id) and new_name[row.gene_id] != row.display_label: + if ( + not alias.get(row.display_label) + or alias.get(row.display_label) != new_name[row.gene_id] + ): + logging.warn( + f"gene ID ({row.gene_id}) {id_to_stable_id[row.gene_id]} new = {new_name[row.gene_id]} old = {row.display_label}" + ) + warnings_count += 1 + + if total_count: + logging.warn( + f"{warnings_count} entries with different names out of {total_count} protein coding gene comparisons" + ) + + xref_dbi.close() + core_dbi.close() + + self.update_process_status("name_change_check_done") + + return warnings_count diff --git a/src/python/ensembl/production/xrefs/mappers/UniParcMapper.py b/src/python/ensembl/production/xrefs/mappers/UniParcMapper.py new file mode 100644 index 000000000..f518303bb --- /dev/null +++ 
b/src/python/ensembl/production/xrefs/mappers/UniParcMapper.py @@ -0,0 +1,28 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for processing UniParc xref data.""" + +from ensembl.production.xrefs.mappers.ChecksumMapper import * + + +class UniParcMapper(ChecksumMapper): + def target(self) -> str: + return self.mapper().protein_file() + + def external_db_name(self) -> str: + return "UniParc" + + def object_type(self) -> str: + return "Translation" diff --git a/src/python/ensembl/production/xrefs/mappers/XrefLoader.py b/src/python/ensembl/production/xrefs/mappers/XrefLoader.py new file mode 100644 index 000000000..be634d870 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/XrefLoader.py @@ -0,0 +1,804 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
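RNACentralMapper and UniParcMapper above are deliberately thin: a checksum-based source only has to say which FASTA file to checksum (target), which external_db the hits belong to, and which Ensembl object type they attach to; the MD5 hashing and accession lookup are handled by ChecksumBasic and MySQLChecksum later in this patch, with the shared ChecksumMapper base class tying them together. A new checksum source would follow the same shape, sketched here with made-up names (MyChecksumMapper and the "MyDB" external_db are placeholders, not sources known to this pipeline):

from ensembl.production.xrefs.mappers.ChecksumMapper import ChecksumMapper


class MyChecksumMapper(ChecksumMapper):
    # Placeholder subclass illustrating the three hooks a checksum source overrides.
    def target(self) -> str:
        # FASTA file whose sequences are checksummed (peptides here, as for UniParc).
        return self.mapper().protein_file()

    def external_db_name(self) -> str:
        # Must match a db_name row in the core external_db table.
        return "MyDB"

    def object_type(self) -> str:
        # Ensembl feature type the resulting object_xrefs point at.
        return "Translation"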
+ +"""Mapper module for loading xref data into the core DB.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class XrefLoader(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def update(self, species_name: str) -> None: + logging.info("Loading xrefs into core DB") + + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + # Delete xref data related to projections + self.delete_projection_data(core_dbi) + + # Get the source IDs of relevant external DBs + name_to_external_db_id, source_id_to_external_db_id = {}, {} + + query = select(ExternalDbORM.external_db_id, ExternalDbORM.db_name) + for row in core_dbi.execute(query).mappings().all(): + name_to_external_db_id[row.db_name] = row.external_db_id + + query = ( + select(SourceUORM.source_id, SourceUORM.name) + .where(SourceUORM.source_id == XrefUORM.source_id) + .group_by(SourceUORM.source_id) + ) + for row in xref_dbi.execute(query).mappings().all(): + if name_to_external_db_id.get(row.name): + source_id_to_external_db_id[row.source_id] = name_to_external_db_id[ + row.name + ] + elif re.search(r"notransfer$", row.name): + continue + else: + raise LookupError( + f"Could not find {row.name} in external_db table in the core DB" + ) + + # Reset dumped field in case module is running again + xref_dbi.execute( + update(XrefUORM) + .values(dumped=None) + .where(XrefUORM.dumped != "NO_DUMP_ANOTHER_PRIORITY") + ) + + # Delete existing xrefs in core DB (only from relevant sources) + self.deleted_existing_xrefs(name_to_external_db_id, xref_dbi, core_dbi) + + # Get the offsets for xref and object_xref tables + # This is used to track the xrefs whe mapping onto the core DB + xref_offset = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() + object_xref_offset = core_dbi.execute( + select(func.max(ObjectXrefCORM.object_xref_id)) + ).scalar() + + if not xref_offset: + xref_offset = 0 + else: + xref_offset = int(xref_offset) + self.add_meta_pair("xref_offset", xref_offset) + if not object_xref_offset: + object_xref_offset = 0 + else: + object_xref_offset = int(object_xref_offset) + self.add_meta_pair("object_xref_offset", object_xref_offset) + + logging.info( + f"DB offsets: xref={xref_offset}, object_xref={object_xref_offset}" + ) + + # Get analysis IDs + analysis_ids = self.get_analysis(core_dbi) + + # Prepare some queries + xref_object_query = ( + select(XrefUORM, ObjectXrefUORM) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ) + .order_by(XrefUORM.xref_id) + ) + xref_object_identity_query = ( + select(XrefUORM, ObjectXrefUORM, IdentityXrefUORM) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ) + .order_by(XrefUORM.xref_id) + ) + + #### TO DO: transaction + + # Get source info from xref DB + query = ( + select( + SourceUORM.source_id, + SourceUORM.name, + XrefUORM.info_type, + func.count(XrefUORM.xref_id).label("count"), + SourceUORM.priority_description, + SourceUORM.source_release, + ) + .where( + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ) + .group_by(SourceUORM.source_id, SourceUORM.name, XrefUORM.info_type) + ) + for source_row in xref_dbi.execute(query).mappings().all(): + # We only care about specific sources + if not 
name_to_external_db_id.get(source_row.name): + continue + logging.info( + f"Updating source '{source_row.name}' ({source_row.source_id}) in core" + ) + + where_from = source_row.priority_description + if where_from: + where_from = f"Generated via {where_from}" + + external_id = name_to_external_db_id[source_row.name] + xref_list = [] + + if ( + source_row.info_type == "DIRECT" + or source_row.info_type == "INFERRED_PAIR" + or source_row.info_type == "MISC" + ): + count, last_xref_id = 0, 0 + + # Get all direct, inferred pair and misc xrefs from intermediate DB + query = xref_object_identity_query.where( + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == source_row.info_type, + ) + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + info_text = xref_row.info_text + if not info_text: + info_text = where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": xref_row.label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, core_dbi) + last_xref_id = xref_id + + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + "analysis_id": analysis_ids[xref_row.ensembl_object_type], + } + object_xref_id = self.add_object_xref( + object_xref_offset, object_xref_args, core_dbi + ) + + # Add identity xref into core DB + if xref_row.translation_start: + query = ( + insert(IdentityXrefCORM) + .values( + object_xref_id=object_xref_id + object_xref_offset, + xref_identity=xref_row.query_identity, + ensembl_identity=xref_row.target_identity, + xref_start=xref_row.hit_start, + xref_end=xref_row.hit_end, + ensembl_start=xref_row.translation_start, + ensembl_end=xref_row.translation_end, + cigar_line=xref_row.cigar_line, + score=xref_row.score, + evalue=xref_row.evalue, + ) + .prefix_with("IGNORE") + ) + core_dbi.execute(query) + + logging.info( + f"\tLoaded {count} {source_row.info_type} xrefs for '{species_name}'" + ) + elif source_row.info_type == "CHECKSUM": + count, last_xref_id = 0, 0 + + # Get all checksum xrefs from intermediate DB + query = xref_object_query.where( + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == source_row.info_type, + ) + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + info_text = xref_row.info_text + if not info_text: + info_text = where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": xref_row.label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, core_dbi) + last_xref_id = xref_id + + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + 
"analysis_id": analysis_ids["checksum"], + } + object_xref_id = self.add_object_xref( + object_xref_offset, object_xref_args, core_dbi + ) + + logging.info(f"\tLoaded {count} CHECKSUM xrefs for '{species_name}'") + elif source_row.info_type == "DEPENDENT": + count, last_xref_id, last_ensembl_id, master_error_count = 0, 0, 0, 0 + master_problems = [] + + # Get all dependent xrefs from intermediate DB + MasterXref = aliased(XrefUORM) + query = ( + select(XrefUORM, ObjectXrefUORM) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.master_xref_id == MasterXref.xref_id, + MasterXref.source_id == SourceUORM.source_id, + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == "DEPENDENT", + ) + .order_by( + XrefUORM.xref_id, ObjectXrefUORM.ensembl_id, SourceUORM.ordered + ) + ) + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + label = xref_row.label + if not label: + label = xref_row.accession + info_text = xref_row.info_text + if not info_text: + info_text = where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, core_dbi) + + if ( + last_xref_id != xref_id + or last_ensembl_id != xref_row.ensembl_id + ): + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + "analysis_id": analysis_ids[xref_row.ensembl_object_type], + } + object_xref_id = self.add_object_xref( + object_xref_offset, object_xref_args, core_dbi + ) + + if xref_row.master_xref_id: + # Add dependent xref into core DB + core_dbi.execute( + insert(DependentXrefCORM) + .values( + object_xref_id=object_xref_id + object_xref_offset, + master_xref_id=xref_row.master_xref_id + + xref_offset, + dependent_xref_id=xref_id + xref_offset, + ) + .prefix_with("IGNORE") + ) + else: + if master_error_count < 10: + master_problems.append(xref_row.accession) + + master_error_count += 1 + + last_xref_id = xref_id + last_ensembl_id = xref_row.ensembl_id + + if len(master_problems) > 0: + logging.warn( + f"For {source_row.name}, there were {master_error_count} problem master xrefs. 
Examples are: " + + ", ".join(master_problems) + ) + + logging.info(f"\tLoaded {count} DEPENDENT xrefs for '{species_name}'") + elif source_row.info_type == "SEQUENCE_MATCH": + count, last_xref_id = 0, 0 + + # Get all direct, inferred pair and misc xrefs from intermediate DB + query = xref_object_identity_query.where( + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == source_row.info_type, + ) + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + info_text = xref_row.info_text + if not info_text: + info_text = where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": xref_row.label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, core_dbi) + last_xref_id = xref_id + + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + "analysis_id": analysis_ids[xref_row.ensembl_object_type], + } + object_xref_id = self.add_object_xref( + object_xref_offset, object_xref_args, core_dbi + ) + + # Add identity xref into core DB + query = ( + insert(IdentityXrefCORM) + .values( + object_xref_id=object_xref_id + object_xref_offset, + xref_identity=xref_row.query_identity, + ensembl_identity=xref_row.target_identity, + xref_start=xref_row.hit_start, + xref_end=xref_row.hit_end, + ensembl_start=xref_row.translation_start, + ensembl_end=xref_row.translation_end, + cigar_line=xref_row.cigar_line, + score=xref_row.score, + evalue=xref_row.evalue, + ) + .prefix_with("IGNORE") + ) + core_dbi.execute(query) + + logging.info( + f"\tLoaded {count} SEQUENCE_MATCH xrefs for '{species_name}'" + ) + else: + logging.debug(f"\tPROBLEM: what type is {source_row.info_type}") + + # Transfer synonym data + if len(xref_list) > 0: + syn_count = 0 + + # Get synonyms + query = select(SynonymORM.xref_id, SynonymORM.synonym).where( + SynonymORM.xref_id.in_(xref_list) + ) + for syn_row in xref_dbi.execute(query).mappings().all(): + core_dbi.execute( + insert(ExternalSynonymORM).values( + xref_id=syn_row.xref_id + xref_offset, + synonym=syn_row.synonym, + ) + ) + + syn_count += 1 + + logging.info(f"\tLoaded {syn_count} synonyms for '{species_name}'") + + # Set dumped status + xref_dbi.execute( + update(XrefUORM) + .values(dumped="MAPPED") + .where(XrefUORM.xref_id.in_(xref_list)) + ) + + # Update release info + if source_row.source_release and source_row.source_release != "1": + core_dbi.execute( + update(ExternalDbORM) + .values(db_release=source_row.source_release) + .where(ExternalDbORM.external_db_id == external_id) + ) + + # Update the unmapped xrefs + self.update_unmapped_xrefs(xref_dbi) + + self.update_process_status("core_loaded") + + xref_dbi.close() + core_dbi.close() + + def delete_projection_data(self, dbi: Connection) -> None: + # Delete all the projections from the core DB + + dbi.execute(delete(OntologyXrefORM)) + logging.info("Deleted all ontology_xref rows") + + row_count = dbi.execute( + update(GeneORM) + .values(display_xref_id=None, description=None) + .where( + XrefCORM.xref_id == GeneORM.display_xref_id, + XrefCORM.info_type == "PROJECTION", + ) + 
).rowcount + logging.info( + f"Set display_xref_id and description to NULL in {row_count} gene row(s) related to PROJECTION xrefs" + ) + + counts = {} + counts["external_synonym"] = dbi.execute( + delete(ExternalSynonymORM).where( + XrefCORM.xref_id == ExternalSynonymORM.xref_id, + XrefCORM.info_type == "PROJECTION", + ) + ).rowcount + counts["dependent_xref"] = dbi.execute( + delete(DependentXrefCORM).where( + XrefCORM.xref_id == DependentXrefCORM.dependent_xref_id, + XrefCORM.info_type == "PROJECTION", + ) + ).rowcount + counts["object_xref"] = dbi.execute( + delete(ObjectXrefCORM).where( + XrefCORM.xref_id == ObjectXrefCORM.xref_id, + XrefCORM.info_type == "PROJECTION", + ) + ).rowcount + counts["xref"] = dbi.execute( + delete(XrefCORM).where(XrefCORM.info_type == "PROJECTION") + ).rowcount + + logging.info( + f"Deleted all PROJECTIONs rows: {counts['external_synonym']} external_synonyms, {counts['dependent_xref']} dependent_xrefs, {counts['object_xref']} object_xrefs, {counts['xref']} xrefs" + ) + + def deleted_existing_xrefs(self, name_to_external_db_id: Dict[str, int], xref_dbi: Connection, core_dbi: Connection) -> None: + # For each external_db to be updated, delete the existing xrefs + query = ( + select(SourceUORM.name, func.count(XrefUORM.xref_id).label("count")) + .where( + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ) + .group_by(SourceUORM.name) + ) + for row in xref_dbi.execute(query).mappings().all(): + if not name_to_external_db_id.get(row.name): + continue + + name = row.name + external_db_id = name_to_external_db_id[name] + counts = {"master_dependent_xref": 0, "master_object_xref": 0} + + logging.info(f"For source '{name}'") + + counts["gene"] = core_dbi.execute( + update(GeneORM) + .values(display_xref_id=None, description=None) + .where( + GeneORM.display_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + logging.info( + f"\tSet display_xref_id=NULL and description=NULL for {counts['gene']} gene row(s)" + ) + + counts["external_synonym"] = core_dbi.execute( + delete(ExternalSynonymORM).where( + ExternalSynonymORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["identity_xref"] = core_dbi.execute( + delete(IdentityXrefCORM).where( + IdentityXrefCORM.object_xref_id == ObjectXrefCORM.object_xref_id, + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["object_xref"] = core_dbi.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + + MasterXref = aliased(XrefCORM) + DependentXref = aliased(XrefCORM) + + query = select( + ObjectXrefCORM.object_xref_id, + DependentXrefCORM.master_xref_id, + DependentXrefCORM.dependent_xref_id, + ).where( + ObjectXrefCORM.object_xref_id == DependentXrefCORM.object_xref_id, + MasterXref.xref_id == DependentXrefCORM.master_xref_id, + DependentXref.xref_id == DependentXrefCORM.dependent_xref_id, + MasterXref.external_db_id == external_db_id, + ) + for row in core_dbi.execute(query).mappings().all(): + counts["master_dependent_xref"] += core_dbi.execute( + delete(DependentXrefCORM).where( + DependentXrefCORM.master_xref_id == row.master_xref_id, + DependentXrefCORM.dependent_xref_id == row.dependent_xref_id, + ) + ).rowcount + counts["master_object_xref"] += core_dbi.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.object_xref_id == 
row.object_xref_id + ) + ).rowcount + + counts["dependent_xref"] = core_dbi.execute( + delete(DependentXrefCORM).where( + DependentXrefCORM.dependent_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["xref"] = core_dbi.execute( + delete(XrefCORM).where(XrefCORM.external_db_id == external_db_id) + ).rowcount + counts["unmapped_object"] = core_dbi.execute( + delete(UnmappedObjectORM).where( + UnmappedObjectORM.unmapped_object_type == "xref", + UnmappedObjectORM.external_db_id == external_db_id, + ) + ).rowcount + + logging.info( + f"\tDeleted rows: {counts['external_synonym']} external_synonyms, {counts['identity_xref']} identity_xrefs, {counts['object_xref']} object_xrefs, {counts['master_dependent_xref']} master dependent_xrefs, {counts['master_object_xref']} master object_xrefs, {counts['dependent_xref']} dependent_xrefs, {counts['xref']} xrefs, {counts['unmapped_object']} unmapped_objects" + ) + + def get_analysis(self, dbi: Connection) -> Dict[str, int]: + analysis_ids = {} + type_to_logic_name = { + "Gene": "xrefexoneratedna", + "Transcript": "xrefexoneratedna", + "Translation": "xrefexonerateprotein", + } + + for object_type in ["Gene", "Transcript", "Translation"]: + logic_name = type_to_logic_name[object_type] + analysis_ids[object_type] = self.get_single_analysis(logic_name, dbi) + + analysis_ids["checksum"] = self.get_single_analysis("xrefchecksum", dbi) + + return analysis_ids + + def get_single_analysis(self, logic_name: str, dbi: Connection) -> int: + analysis_id = dbi.execute( + select(AnalysisORM.analysis_id).where(AnalysisORM.logic_name == logic_name) + ).scalar() + + if not analysis_id: + Session = sessionmaker(self.core()) + with Session.begin() as session: + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + analysis_object = AnalysisORM(logic_name=logic_name, created=now) + session.add(analysis_object) + session.flush() + analysis_id = analysis_object.analysis_id + + return analysis_id + + def add_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> int: + xref_id = args["xref_id"] + accession = args["accession"] + external_db_id = args["external_db_id"] + label = args["label"] + description = args["description"] + version = args["version"] + info_type = args["info_type"] + info_text = args["info_text"] + + new_xref_id = dbi.execute( + select(XrefCORM.xref_id).where( + XrefCORM.dbprimary_acc == accession, + XrefCORM.external_db_id == external_db_id, + XrefCORM.info_type == info_type, + XrefCORM.info_text == info_text, + XrefCORM.version == version, + ) + ).scalar() + + if not new_xref_id: + dbi.execute( + insert(XrefCORM).values( + xref_id=xref_id + offset, + external_db_id=external_db_id, + dbprimary_acc=accession, + display_label=label, + version=version, + description=description, + info_type=info_type, + info_text=info_text, + ) + ) + + return xref_id + else: + return int(new_xref_id) - offset + + def add_object_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> int: + object_xref_id = args["object_xref_id"] + ensembl_id = args["ensembl_id"] + ensembl_type = args["ensembl_type"] + xref_id = args["xref_id"] + analysis_id = args["analysis_id"] + + new_object_xref_id = dbi.execute( + select(ObjectXrefCORM.object_xref_id).where( + ObjectXrefCORM.xref_id == xref_id, + ObjectXrefCORM.ensembl_object_type == ensembl_type, + ObjectXrefCORM.ensembl_id == ensembl_id, + ObjectXrefCORM.analysis_id == analysis_id, + ) + ).scalar() + + if not new_object_xref_id: + dbi.execute( + 
insert(ObjectXrefCORM).values( + object_xref_id=object_xref_id + offset, + ensembl_id=ensembl_id, + ensembl_object_type=ensembl_type, + xref_id=xref_id, + analysis_id=analysis_id, + ) + ) + + return object_xref_id + else: + return int(new_object_xref_id) - offset + + def update_unmapped_xrefs(self, dbi: Connection) -> None: + logging.info("Updating unmapped xrefs in xref DB") + + # Direct xrefs + query = ( + select(XrefUORM.xref_id) + .outerjoin(ObjectXrefUORM, XrefUORM.xref_id == ObjectXrefUORM.xref_id) + .where( + XrefUORM.source_id == SourceUORM.source_id, + XrefUORM.dumped == None, + ObjectXrefUORM.ox_status != "FAILED_PRIORITY", + XrefUORM.info_type == "DIRECT", + ) + ) + result = dbi.execute(query).fetchall() + xref_ids = [row[0] for row in result] + dbi.execute( + update(XrefUORM) + .values(dumped="UNMAPPED_NO_STABLE_ID") + .where(XrefUORM.xref_id.in_(xref_ids)) + ) + + # Misc xrefs + dbi.execute( + update(XrefUORM) + .values(dumped="UNMAPPED_NO_MAPPING") + .where( + XrefUORM.source_id == SourceUORM.source_id, + XrefUORM.dumped == None, + XrefUORM.info_type == "MISC", + ) + ) + + # Dependent xrefs + MasterXref = aliased(XrefUORM) + DependentXref = aliased(XrefUORM) + query = ( + select(DependentXref.xref_id) + .outerjoin( + DependentXrefUORM, + DependentXrefUORM.dependent_xref_id == DependentXref.xref_id, + ) + .outerjoin(ObjectXrefUORM, ObjectXrefUORM.xref_id == DependentXref.xref_id) + .where( + DependentXref.source_id == SourceUORM.source_id, + DependentXrefUORM.master_xref_id == MasterXref.xref_id, + DependentXref.dumped == None, + ObjectXrefUORM.ox_status != "FAILED_PRIORITY", + DependentXref.info_type == "DEPENDENT", + ) + ) + result = dbi.execute(query).fetchall() + xref_ids = [row[0] for row in result] + dbi.execute( + update(XrefUORM) + .values(dumped="UNMAPPED_MASTER_FAILED") + .where(XrefUORM.xref_id.in_(xref_ids)) + ) + + # Sequence match + query = ( + select(XrefUORM.xref_id) + .outerjoin(ObjectXrefUORM, XrefUORM.xref_id == ObjectXrefUORM.xref_id) + .outerjoin( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + .where( + XrefUORM.source_id == SourceUORM.source_id, + XrefUORM.xref_id == PrimaryXrefORM.xref_id, + XrefUORM.dumped == None, + XrefUORM.info_type == "SEQUENCE_MATCH", + ) + ) + result = dbi.execute(query).fetchall() + xref_ids = [row[0] for row in result] + dbi.execute( + update(XrefUORM) + .values(dumped="UNMAPPED_NO_MAPPING") + .where(XrefUORM.xref_id.in_(xref_ids)) + ) + + # Dependents with non existent masters (none on time of loading) + dbi.execute( + update(XrefUORM) + .values(dumped="UNMAPPED_NO_MASTER") + .where( + XrefUORM.source_id == SourceUORM.source_id, + XrefUORM.dumped == None, + XrefUORM.info_type == "DEPENDENT", + ) + ) diff --git a/src/python/ensembl/production/xrefs/mappers/__init__.py b/src/python/ensembl/production/xrefs/mappers/__init__.py new file mode 100644 index 000000000..a3a8b1334 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/__init__.py @@ -0,0 +1,15 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Xref mappers modules.""" diff --git a/src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py b/src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py new file mode 100644 index 000000000..b97b858c7 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py @@ -0,0 +1,91 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base method module for handling checksums.""" + +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq +import hashlib + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper +from sqlalchemy.engine import Connection +from typing import List, Dict, Any + +DEFAULT_BATCH_SIZE = 1000 +DEFAULT_LOG_SIZE = 10000 + + +class ChecksumBasic: + def __init__(self, args: Dict[str, Any] = None) -> None: + if args is None: + args = {} + + self._mapper = args.get("MAPPER") + if args.get("BATCH_SIZE"): + self._batch_size = args["BATCH_SIZE"] + else: + self._batch_size = DEFAULT_BATCH_SIZE + + def mapper(self, mapper: BasicMapper = None) -> BasicMapper: + if mapper: + self._mapper = mapper + + return self._mapper + + def batch_size(self, batch_size: int = None) -> int: + if batch_size: + self._batch_size = batch_size + + return self._batch_size + + def run(self, target: str, source_id: int, object_type: str, dbi: Connection) -> List[Dict[str, Any]]: + results, tmp_list = [], [] + count, total_count = 0, 0 + batch_size = self.batch_size() + + for record in SeqIO.parse(target, "fasta"): + tmp_list.append(record) + count += 1 + + if (count % batch_size) == 0: + res = self.perform_mapping(tmp_list, source_id, object_type, dbi) + for row in res: + results.append(row) + + total_count += count + if total_count % DEFAULT_LOG_SIZE: + self.mapper().log_progress( + f"Finished batch mapping of {total_count} sequences" + ) + count = 0 + tmp_list.clear() + + # Final mapping if there were some left over + if len(tmp_list) > 0: + self.mapper().log_progress( + f"Finished batch mapping of {total_count} sequences" + ) + res = self.perform_mapping(tmp_list, source_id, object_type, dbi) + for row in res: + results.append(row) + tmp_list.clear() + + return results + + def md5_checksum(self, sequence: Seq) -> str: + digest = hashlib.md5() + digest.update(sequence.encode()) + + return digest.hexdigest() diff --git a/src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py b/src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py new file mode 100644 index 
000000000..993753cd6 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py @@ -0,0 +1,48 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base method module for handling mysql checksums.""" + +from ensembl.production.xrefs.mappers.methods.ChecksumBasic import * + +from sqlalchemy import select +from ensembl.xrefs.xref_source_db_model import ChecksumXref as ChecksumXrefSORM + + +class MySQLChecksum(ChecksumBasic): + def perform_mapping(self, sequences: List[SeqRecord], source_id: int, object_type: str, dbi: Connection) -> List[Dict[str, Any]]: + final_results = [] + + for sequence in sequences: + checksum = self.md5_checksum(str(sequence.seq)).upper() + upi = None + + query = select(ChecksumXrefSORM.accession).where( + ChecksumXrefSORM.checksum == checksum, + ChecksumXrefSORM.source_id == source_id, + ) + for row in dbi.execute(query).mappings().all(): + local_upi = row.accession + if upi: + raise LookupError( + f"The sequence {sequence.id} had a checksum of {checksum} but this resulted in more than one UPI: [{upi}, {local_upi}]" + ) + upi = local_upi + + if upi: + final_results.append( + {"id": sequence.id, "upi": upi, "object_type": object_type} + ) + + return final_results diff --git a/src/python/ensembl/production/xrefs/mappers/methods/__init__.py b/src/python/ensembl/production/xrefs/mappers/methods/__init__.py new file mode 100644 index 000000000..33a2087e1 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/methods/__init__.py @@ -0,0 +1,15 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Xref mapper methods modules.""" diff --git a/src/python/ensembl/production/xrefs/mappers/species/__init__.py b/src/python/ensembl/production/xrefs/mappers/species/__init__.py new file mode 100644 index 000000000..9685c28ca --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/__init__.py @@ -0,0 +1,15 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Xref species-specific mapper modules.""" diff --git a/src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py b/src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py new file mode 100644 index 000000000..3a2b20dbd --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py @@ -0,0 +1,39 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species aedes_aegypti.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class aedes_aegypti(BasicMapper): + def gene_description_sources(self) -> List[str]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_External_Description", + ] + + return sources_list + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_External_Description", + ] + + ignore_queries = {} + + return sources_list, ignore_queries diff --git a/src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py b/src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py new file mode 100644 index 000000000..46e30cf99 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py @@ -0,0 +1,42 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Mapper extension module for species anopheles_gambiae.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class anopheles_gambiae(BasicMapper): + def gene_description_sources(self) -> List[str]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_RNA_Description", + ] + + return sources_list + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_RNA_Description", + ] + + ignore_queries = {} + + return sources_list, ignore_queries + + def gene_description_filter_regexps(self) -> List[str]: + return [] diff --git a/src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py b/src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py new file mode 100644 index 000000000..36a5f6696 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py @@ -0,0 +1,49 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species culex_quinquefasciatus.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class culex_quinquefasciatus(BasicMapper): + def gene_description_sources(self) -> List[str]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_RNA_Description", + "VB_External_Description", + ] + + return sources_list + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_RNA_Description", + "VB_External_Description", + ] + + ignore_queries = {} + + return sources_list, ignore_queries + + def gene_description_filter_regexps(self) -> List[str]: + return [] + + def no_source_label_list(self) -> List[str]: + sources_list = ["VB_RNA_Description", "VB_External_Description"] + + return sources_list diff --git a/src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py b/src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py new file mode 100644 index 000000000..3a2b155ec --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py @@ -0,0 +1,30 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Mapper extension module for species danio_rerio.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * +from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs + + +class danio_rerio(BasicMapper): + def set_display_xrefs(self) -> None: + display = DisplayXrefs(self) + display.set_display_xrefs_from_stable_table() + + def official_name(self) -> str: + return "ZFIN_ID" + + def set_transcript_names(self) -> None: + return None diff --git a/src/python/ensembl/production/xrefs/mappers/species/drosophila.py b/src/python/ensembl/production/xrefs/mappers/species/drosophila.py new file mode 100644 index 000000000..2e327a735 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/drosophila.py @@ -0,0 +1,44 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species drosophila.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class drosophila(BasicMapper): + def gene_description_filter_regexps(self) -> List[str]: + return [] + + def gene_description_sources(self) -> List[str]: + sources_list = ["FlyBaseName_gene", "FlyBaseCGID_gene"] + + return sources_list + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["FlyBaseName_transcript", "FlyBaseCGID_transcript"] + + ignore_queries = {} + + return sources_list, ignore_queries + + def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["FlyBaseName_gene", "FlyBaseCGID_gene", "flybase_gene_id"] + + ignore_queries = {} + + return sources_list, ignore_queries + + def set_transcript_names(self) -> None: + return None diff --git a/src/python/ensembl/production/xrefs/mappers/species/eukaryota.py b/src/python/ensembl/production/xrefs/mappers/species/eukaryota.py new file mode 100644 index 000000000..1791da9c5 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/eukaryota.py @@ -0,0 +1,277 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Mapper extension module for species eukaryota.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class eukaryota(BasicMapper): + def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = [ + "TAIR_SYMBOL", + "RFAM", + "RNAMMER", + "TRNASCAN_SE", + "Uniprot_gn", + "ENA_GENE", + "BROAD_U_maydis", + "BROAD_F_oxysporum", + "BROAD_G_zeae", + "BROAD_G_moniliformis", + "BROAD_P_infestans", + "phyra_jgi_v1.1", + "physo1_jgi_v1.1", + "phatr_jgi_v2", + "phatr_jgi_v2_bd", + "PGD_GENE", + "Mycgr3_jgi_v2.0_gene", + "BROAD_Magnaporthe_DB", + "PHYTOZOME_GMAX_GENE", + ] + + ignore_queries = {} + + # Ignore EntrezGene labels dependent on predicted RefSeqs + MasterXref = aliased(XrefUORM) + DependentXref = aliased(XrefUORM) + MasterSource = aliased(SourceUORM) + DependentSource = aliased(SourceUORM) + + query = select(ObjectXrefUORM.object_xref_id.distinct()).where( + ObjectXrefUORM.xref_id == DependentXrefUORM.dependent_xref_id, + ObjectXrefUORM.master_xref_id == DependentXrefUORM.master_xref_id, + DependentXrefUORM.dependent_xref_id == DependentXref.xref_id, + DependentXrefUORM.master_xref_id == MasterXref.xref_id, + MasterXref.source_id == MasterSource.source_id, + DependentXref.source_id == DependentSource.source_id, + MasterSource.name.like("Refseq%predicted"), + DependentSource.name.like("EntrezGene"), + ObjectXrefUORM.ox_status == "DUMP_OUT", + ) + ignore_queries["EntrezGene"] = query + + query = ( + select(ObjectXrefUORM.object_xref_id) + .join(XrefUORM, XrefUORM.xref_id == ObjectXrefUORM.xref_id) + .join(SourceUORM, SourceUORM.source_id == XrefUORM.source_id) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + XrefUORM.label.regexp_match("^LOC[[:digit:]]+"), + ) + ) + ignore_queries["LOC_prefix"] = query + + return sources_list, ignore_queries + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = [ + "RFAM", + "RNAMMER", + "TRNASCAN_SE", + "Uniprot_gn_trans_name", + "ENA_GENE", + "BROAD_U_maydis", + "BROAD_F_oxysporum", + "BROAD_G_zeae", + "BROAD_G_moniliformis", + "BROAD_P_infestans", + "phyra_jgi_v1.1", + "physo1_jgi_v1.1", + "phatr_jgi_v2", + "phatr_jgi_v2_bd", + "PGD_GENE", + "Mycgr3_jgi_v2.0_gene", + "BROAD_Magnaporthe_DB", + "PHYTOZOME_GMAX_GENE", + ] + + ignore_queries = {} + + # Ignore EntrezGene labels dependent on predicted RefSeqs + MasterXref = aliased(XrefUORM) + DependentXref = aliased(XrefUORM) + MasterSource = aliased(SourceUORM) + DependentSource = aliased(SourceUORM) + + query = select(ObjectXrefUORM.object_xref_id.distinct()).where( + ObjectXrefUORM.xref_id == DependentXrefUORM.dependent_xref_id, + ObjectXrefUORM.master_xref_id == DependentXrefUORM.master_xref_id, + DependentXrefUORM.dependent_xref_id == DependentXref.xref_id, + DependentXrefUORM.master_xref_id == MasterXref.xref_id, + MasterXref.source_id == MasterSource.source_id, + DependentXref.source_id == DependentSource.source_id, + MasterSource.name.like("Refseq%predicted"), + DependentSource.name.like("EntrezGene"), + ObjectXrefUORM.ox_status == "DUMP_OUT", + ) + ignore_queries["EntrezGene"] = query + + query = ( + select(ObjectXrefUORM.object_xref_id) + .join(XrefUORM, XrefUORM.xref_id == ObjectXrefUORM.xref_id) + .join(SourceUORM, SourceUORM.source_id == XrefUORM.source_id) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + XrefUORM.label.regexp_match("^LOC[[:digit:]]+"), + ) + ) + ignore_queries["LOC_prefix"] = query + + return sources_list, ignore_queries + + def gene_description_sources(self) -> 
List[str]: + sources_list = [ + "TAIR_LOCUS", + "PomBase_GENE", + "PomBase_TRANSCRIPT", + "Uniprot/SWISSPROT", + "Uniprot/SPTREMBL", + "BROAD_U_maydis", + "BROAD_F_oxysporum", + "BROAD_G_zeae", + "BROAD_G_moniliformis", + "BROAD_P_infestans", + "phyra_jgi_v1.1", + "physo1_jgi_v1.1", + "phatr_jgi_v2", + "phatr_jgi_v2_bd", + "PGD_GENE", + "BROAD_Magnaporthe_DB", + "PGSC_GENE", + "PHYTOZOME_GMAX_GENE", + "RFAM", + "TRNASCAN_SE", + "RNAMMER", + ] + + return sources_list + + def set_transcript_names(self) -> None: + logging.info("Assigning transcript names from gene names") + + core_dbi = self.core().connect() + + # Reset transcript display xrefs + core_dbi.execute(update(TranscriptORM).values(display_xref_id=None)) + + # Get the max xref and object_xref IDs + xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() + xref_id = int(xref_id) + object_xref_id = core_dbi.execute( + select(func.max(ObjectXrefCORM.object_xref_id)) + ).scalar() + object_xref_id = int(object_xref_id) + + # Get all genes with set display_xref_id + query = select( + GeneORM.gene_id, + ExternalDbORM.db_name, + XrefCORM.dbprimary_acc, + XrefCORM.display_label, + XrefCORM.description, + ).where( + GeneORM.display_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == ExternalDbORM.external_db_id, + ) + for row in core_dbi.execute(query).mappings().all(): + # Get the ID of transcript name external DB + external_db_id = core_dbi.execute( + select(ExternalDbORM.external_db_id).where( + ExternalDbORM.db_name.like(f"{row.db_name}_trans_name") + ) + ).scalar() + + if not external_db_id: + raise LookupError( + f"No external_db_id found for '{row.db_name}_trans_name'" + ) + + # Get transcripts related to current gene + query = ( + select(TranscriptORM.transcript_id) + .where(TranscriptORM.gene_id == row.gene_id) + .order_by(TranscriptORM.seq_region_start, TranscriptORM.seq_region_end) + ) + for transcript_row in core_dbi.execute(query).mappings().all(): + object_xref_id += 1 + + # Check if xref already exists + insert_xref_id = core_dbi.execute( + select(XrefCORM.xref_id).where( + XrefCORM.external_db_id == external_db_id, + XrefCORM.display_label == row.display_label, + XrefCORM.version == 0, + XrefCORM.description == row.description, + XrefCORM.info_type == "MISC", + XrefCORM.info_text == "via gene name", + ) + ).scalar() + + if not insert_xref_id: + xref_id += 1 + + # Insert new xref + core_dbi.execute( + insert(XrefCORM) + .values( + xref_id=xref_id, + external_db_id=external_db_id, + dbprimary_acc=row.display_label, + display_label=row.display_label, + version=0, + description=row.description, + info_type="MISC", + info_text="via gene name", + ) + .prefix_with("IGNORE") + ) + + insert_xref_id = xref_id + + # Insert object xref + core_dbi.execute( + insert(ObjectXrefCORM).values( + object_xref_id=object_xref_id, + ensembl_id=transcript_row.transcript_id, + ensembl_object_type="Transcript", + xref_id=insert_xref_id, + ) + ) + + # Set transcript display xref + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=insert_xref_id) + .where(TranscriptORM.transcript_id == transcript_row.transcript_id) + ) + + # Delete object xrefs with no matching xref + query = ( + select(ObjectXrefCORM.object_xref_id) + .outerjoin(XrefCORM, XrefCORM.xref_id == ObjectXrefCORM.xref_id) + .where(XrefCORM.xref_id == None) + ) + result = core_dbi.execute(query).fetchall() + object_xref_ids = [row[0] for row in result] + + core_dbi.execute( + delete(ObjectXrefCORM).where( 
ObjectXrefCORM.object_xref_id.in_(object_xref_ids) + ) + ) + + core_dbi.close() diff --git a/src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py b/src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py new file mode 100644 index 000000000..616bd7326 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py @@ -0,0 +1,29 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species homo_sapiens.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class homo_sapiens(BasicMapper): + def official_name(self) -> str: + return "HGNC" + + def set_transcript_names(self) -> None: + return None + + def set_display_xrefs(self) -> None: + display = DisplayXrefs(self) + display.set_display_xrefs_from_stable_table() diff --git a/src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py b/src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py new file mode 100644 index 000000000..5861e03a7 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py @@ -0,0 +1,42 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species ixodes_scapularis.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class ixodes_scapularis(BasicMapper): + def gene_description_sources(self) -> List[str]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_External_Description", + ] + + return sources_list + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_External_Description", + ] + + ignore_queries = {} + + return sources_list, ignore_queries + + def gene_description_filter_regexps(self) -> List[str]: + return [] diff --git a/src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py b/src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py new file mode 100644 index 000000000..cde22b34f --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py @@ -0,0 +1,29 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species mus_musculus.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class mus_musculus(BasicMapper): + def official_name(self) -> str: + return "MGI" + + def set_transcript_names(self) -> None: + return None + + def set_display_xrefs(self) -> None: + display = DisplayXrefs(self) + display.set_display_xrefs_from_stable_table() diff --git a/src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py b/src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py new file mode 100644 index 000000000..df2bb072c --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py @@ -0,0 +1,33 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species neurospora_crassa.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class neurospora_crassa(BasicMapper): + def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["Uniprot_gn"] + + ignore_queries = {} + + return sources_list, ignore_queries + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["Uniprot_gn"] + + ignore_queries = {} + + return sources_list, ignore_queries diff --git a/src/python/ensembl/production/xrefs/mappers/species/parasite.py b/src/python/ensembl/production/xrefs/mappers/species/parasite.py new file mode 100644 index 000000000..408d84d08 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/parasite.py @@ -0,0 +1,46 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Mapper extension module for species parasite.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class parasite(BasicMapper): + def set_transcript_names(self) -> None: + return None + + def gene_description_sources(self) -> List[str]: + sources_list = [ + "RFAM", + "RNAMMER", + "TRNASCAN_SE", + "miRBase", + "HGNC", + "IMGT/GENE_DB", + "Uniprot/SWISSPROT", + "RefSeq_peptide", + "Uniprot/SPTREMBL", + ] + + return sources_list + + def gene_description_filter_regexps(self) -> List[str]: + regex = [ + r"^Uncharacterized protein\s*", + r"^Putative uncharacterized protein\s*", + r"^Hypothetical protein\s*", + ] + + return regex diff --git a/src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py b/src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py new file mode 100644 index 000000000..53925875d --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py @@ -0,0 +1,29 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species rattus_norvegicus.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class rattus_norvegicus(BasicMapper): + def official_name(self) -> str: + return "RGD" + + def set_transcript_names(self) -> None: + return None + + def set_display_xrefs(self) -> None: + display = DisplayXrefs(self) + display.set_display_xrefs_from_stable_table() diff --git a/src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py b/src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py new file mode 100644 index 000000000..707dcc7db --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py @@ -0,0 +1,41 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Mapper extension module for species saccharomyces_cerevisiae.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class saccharomyces_cerevisiae(BasicMapper): + def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["SGD_GENE"] + + ignore_queries = {} + + return sources_list, ignore_queries + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["SGD_TRANSCRIPT"] + + ignore_queries = {} + + return sources_list, ignore_queries + + def gene_description_sources(self) -> List[str]: + sources_list = ["SGD_GENE"] + + return sources_list + + def gene_description_filter_regexps(self) -> List[str]: + return [] diff --git a/src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py b/src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py new file mode 100644 index 000000000..742f1207c --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py @@ -0,0 +1,131 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species sars_cov_2.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class sars_cov_2(BasicMapper): + def set_transcript_names(self) -> None: + logging.info("Assigning transcript names from gene names") + + core_dbi = self.core().connect() + + # Reset transcript display xrefs + core_dbi.execute(update(TranscriptORM).values(display_xref_id=None)) + + # Get the max xref and object_xref IDs + xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() + xref_id = int(xref_id) + object_xref_id = core_dbi.execute( + select(func.max(ObjectXrefCORM.object_xref_id)) + ).scalar() + object_xref_id = int(object_xref_id) + + # Delete transcript name xrefs + core_dbi.execute( + delete(XrefCORM).where( + XrefCORM.xref_id == ObjectXrefCORM.xref_id, + ObjectXrefCORM.ensembl_object_type == "Transcript", + ExternalDbORM.external_db_id == XrefCORM.external_db_id, + ExternalDbORM.db_name.like("%_trans_name"), + ) + ) + + # Get all genes with set display_xref_id + query = select( + GeneORM.gene_id, + ExternalDbORM.db_name, + XrefCORM.dbprimary_acc, + XrefCORM.display_label, + XrefCORM.description, + ).where( + GeneORM.display_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == ExternalDbORM.external_db_id, + ) + for row in core_dbi.execute(query).mappings().all(): + # Get the ID of transcript name external DB + external_db_id = core_dbi.execute( + select(ExternalDbORM.external_db_id).where( + ExternalDbORM.db_name.like(f"{row.db_name}_trans_name") + ) + ).scalar() + + if not external_db_id: + raise LookupError( + f"No external_db_id found for '{row.db_name}_trans_name'" + ) + + # Get transcripts related to current gene + query = ( + select(TranscriptORM.transcript_id) + .where(TranscriptORM.gene_id == row.gene_id) + .order_by(TranscriptORM.seq_region_start, TranscriptORM.seq_region_end) + ) 
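+                # For each transcript of this gene, create a new "<db_name>_trans_name" xref carrying the gene's display label, attach it to the transcript with an object_xref, and set it as the transcript's display xref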
+ for transcript_row in core_dbi.execute(query).mappings().all(): + xref_id += 1 + object_xref_id += 1 + + info_text = f"via gene {row.dbprimary_acc}" + + # Insert new xref + core_dbi.execute( + insert(XrefCORM) + .values( + xref_id=xref_id, + external_db_id=external_db_id, + dbprimary_acc=row.display_label, + display_label=row.display_label, + version=0, + description=row.description, + info_type="MISC", + info_text=info_text, + ) + .prefix_with("IGNORE") + ) + + # Insert object xref + core_dbi.execute( + insert(ObjectXrefCORM).values( + object_xref_id=object_xref_id, + ensembl_id=transcript_row.transcript_id, + ensembl_object_type="Transcript", + xref_id=xref_id, + ) + ) + + # Set transcript display xref + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=xref_id) + .where(TranscriptORM.transcript_id == transcript_row.transcript_id) + ) + + # Delete object xrefs with no matching xref + query = ( + select(ObjectXrefCORM.object_xref_id) + .outerjoin(XrefCORM, XrefCORM.xref_id == ObjectXrefCORM.xref_id) + .where(XrefCORM.xref_id == None) + ) + result = core_dbi.execute(query).fetchall() + object_xref_ids = [row[0] for row in result] + + core_dbi.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.object_xref_id.in_(object_xref_ids) + ) + ) + + core_dbi.close() diff --git a/src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py b/src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py new file mode 100644 index 000000000..8c7d66d8e --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py @@ -0,0 +1,41 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species schizosaccharomyces_pombe.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class schizosaccharomyces_pombe(BasicMapper): + def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["PomBase_GENE"] + + ignore_queries = {} + + return sources_list, ignore_queries + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["PomBase_TRANSCRIPT"] + + ignore_queries = {} + + return sources_list, ignore_queries + + def gene_description_sources(self) -> List[str]: + sources_list = ["PomBase_GENE"] + + return sources_list + + def gene_description_filter_regexps(self) -> List[str]: + return [] diff --git a/src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py b/src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py new file mode 100644 index 000000000..a3182e7f7 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py @@ -0,0 +1,29 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species sus_scrofa.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class sus_scrofa(BasicMapper): + def official_name(self) -> str: + return "PIGGY" + + def set_transcript_names(self) -> None: + return None + + def set_display_xrefs(self) -> None: + display = DisplayXrefs(self) + display.set_display_xrefs_from_stable_table() diff --git a/src/python/ensembl/production/xrefs/mappers/species/wormbase.py b/src/python/ensembl/production/xrefs/mappers/species/wormbase.py new file mode 100644 index 000000000..796d6260e --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/wormbase.py @@ -0,0 +1,124 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species wormbase.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class wormbase(BasicMapper): + def set_display_xrefs(self) -> None: + logging.info( + "Building Transcript and Gene display_xrefs using WormBase direct xrefs" + ) + + core_dbi = self.core().connect() + + external_dbs, gene_display_xrefs, transcript_display_xrefs = {}, {}, {} + + # Get external_db IDs for the sources we are interested in + query = select(ExternalDbORM.external_db_id, ExternalDbORM.db_name).where( + ExternalDbORM.db_name.like("wormbase%") + ) + for row in core_dbi.execute(query).mappings().all(): + external_dbs[row.db_name] = row.external_db_id + + if not external_dbs.get("wormbase_transcript") or not external_dbs.get( + "wormbase_locus" + ): + logging.debug( + "Could not find wormbase_transcript and wormbase_locus in external_db table, so doing nothing" + ) + + core_dbi.close() + + return + + # Get genes with wormbase display xrefs + query = select(ObjectXrefCORM.ensembl_id, XrefCORM.xref_id).where( + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_dbs["wormbase_gseqname"], + ) + for row in core_dbi.execute(query).mappings().all(): + gene_display_xrefs[row.ensembl_id] = row.xref_id + + # Some genes will have a locus name. 
Overwrite display xrefs for those that do + query = select(ObjectXrefCORM.ensembl_id, XrefCORM.xref_id).where( + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_dbs["wormbase_locus"], + ) + for row in core_dbi.execute(query).mappings().all(): + gene_display_xrefs[row.ensembl_id] = row.xref_id + + # Get the wormbase_transcript xrefs for the genes + query = select(ObjectXrefCORM.ensembl_id, XrefCORM.xref_id).where( + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_dbs["wormbase_transcript"], + ) + for row in core_dbi.execute(query).mappings().all(): + transcript_display_xrefs[row.ensembl_id] = row.xref_id + + # Reset gene and transcript display xrefs + core_dbi.execute(update(GeneORM).values(display_xref_id=None)) + core_dbi.execute(update(TranscriptORM).values(display_xref_id=None)) + + # Now update + for gene_id, xref_id in gene_display_xrefs.items(): + core_dbi.execute( + update(GeneORM) + .values(display_xref_id=xref_id) + .where(GeneORM.gene_id == gene_id) + ) + + for transcript_id, xref_id in transcript_display_xrefs.items(): + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=xref_id) + .where(TranscriptORM.transcript_id == transcript_id) + ) + + core_dbi.close() + + logging.info("Updated display xrefs in core for genes and transcripts") + + def set_transcript_names(self) -> None: + return None + + def gene_description_sources(self) -> List[str]: + sources_list = [ + "RFAM", + "RNAMMER", + "TRNASCAN_SE", + "miRBase", + "HGNC", + "IMGT/GENE_DB", + "Uniprot/SWISSPROT", + "RefSeq_peptide", + "Uniprot/SPTREMBL", + ] + + return sources_list + + def gene_description_filter_regexps(self) -> List[str]: + regex = [ + r"^(Protein \S+\s*)+$", + r"^Uncharacterized protein\s*\S+\s*", + r"^Uncharacterized protein\s*", + r"^Putative uncharacterized protein\s*\S+\s*", + r"^Putative uncharacterized protein\s*", + r"^Hypothetical protein\s*\S+\s*", + ] + + return regex diff --git a/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py b/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py new file mode 100644 index 000000000..988b92ffa --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py @@ -0,0 +1,161 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Parser module for ArrayExpress source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class ArrayExpressParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + species_name = args["species_name"] + file = args["file"] + dba = args["dba"] + ensembl_release = args["ensembl_release"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Extract db connection parameters from file name + project, db_user, db_host, db_port, db_name, db_pass = ( + self.extract_params_from_string( + file, ["project", "user", "host", "port", "dbname", "pass"] + ) + ) + if not db_user: + db_user = "ensro" + if not db_port: + db_port = "3306" + + # Get the species name(s) + species_id_to_names = self.species_id_to_names(xref_dbi) + if species_name: + species_id_to_names.setdefault(species_id, []).append(species_name) + + if not species_id_to_names.get(species_id): + return 0, "Skipped. Could not find species ID to name mapping" + names = species_id_to_names[species_id] + + # Look up the species in ftp server and check if active + species_lookup = self._get_species() + active = self._is_active(species_lookup, names, verbose) + if not active: + return 0, "Skipped. ArrayExpress source not active for species" + + species_name = species_id_to_names[species_id][0] + + # Connect to the appropriate arrayexpress db + if db_host: + arrayexpress_db_url = URL.create( + "mysql", db_user, db_pass, db_host, db_port, db_name + ) + elif project and project == "ensembl": + if verbose: + logging.info("Looking for db in mysql-ens-sta-1") + registry = "ensro@mysql-ens-sta-1:4519" + arrayexpress_db_url = self.get_db_from_registry( + species_name, "core", ensembl_release, registry + ) + elif project and project == "ensemblgenomes": + if verbose: + logging.info( + "Looking for db in mysql-eg-staging-1 and mysql-eg-staging-2" + ) + registry = "ensro@mysql-eg-staging-1.ebi.ac.uk:4160" + arrayexpress_db_url = self.get_db_from_registry( + species_name, "core", ensembl_release, registry + ) + + if not arrayexpress_db_url: + registry = "ensro@mysql-eg-staging-2.ebi.ac.uk:4275" + arrayexpress_db_url = self.get_db_from_registry( + species_name, "core", ensembl_release, registry + ) + elif dba: + arrayexpress_db_url = dba + else: + arrayexpress_db_url = None + + if not arrayexpress_db_url: + raise IOError( + f"Could not find ArrayExpress DB. Missing or unsupported project value. Supported values: ensembl, ensemblgenomes." 
+ ) + else: + if verbose: + logging.info(f"Found ArrayExpress DB: {arrayexpress_db_url}") + + xref_count = 0 + + db_engine = self.get_db_engine(arrayexpress_db_url) + with db_engine.connect() as arrayexpress_dbi: + query = select(GeneORM.stable_id).where( + GeneORM.biotype != "LRG_gene", GeneORM.is_current == 1 + ) + result = arrayexpress_dbi.execute(query).mappings().all() + + # Add direct xref for every current gene found + for row in result: + xref_id = self.add_xref( + { + "accession": row.stable_id, + "label": row.stable_id, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + self.add_direct_xref(xref_id, row.stable_id, "gene", "", xref_dbi) + + xref_count += 1 + + result_message = f"Added {xref_count} DIRECT xrefs" + + return 0, result_message + + def _get_species(self) -> Dict[str, int]: + ftp_server = "ftp.ebi.ac.uk" + ftp_dir = "pub/databases/microarray/data/atlas/bioentity_properties/ensembl" + + species_lookup = {} + + ftp = FTP(ftp_server) + ftp.login("anonymous", "-anonymous@") + ftp.cwd(ftp_dir) + remote_files = ftp.nlst() + ftp.close() + + for file in remote_files: + species = file.split(".")[0] + species_lookup[species] = 1 + + return species_lookup + + def _is_active(self, species_lookup: Dict[str, int], names: List[str], verbose: bool) -> bool: + # Loop through the names and aliases first. If we get a hit then great + active = False + for name in names: + if species_lookup.get(name): + if verbose: + logging.info( + f"Found ArrayExpress has declared the name {name}. This was an alias" + ) + active = True + break + + return active diff --git a/src/python/ensembl/production/xrefs/parsers/BaseParser.py b/src/python/ensembl/production/xrefs/parsers/BaseParser.py new file mode 100644 index 000000000..3ae7c2e2c --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/BaseParser.py @@ -0,0 +1,972 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base xref parser module to include all common functions used by xref parsers.""" + +from ensembl.production.xrefs.Base import * + + +class BaseParser(Base): + """Class to represent the base of xref parser modules. Inherits the xref Base class.""" + + def __init__(self, testing: bool = False) -> None: + if not testing: + super().__init__() + + self._direct_xref_tables = { + "gene": GeneDirectXrefORM, + "transcript": TranscriptDirectXrefORM, + "translation": TranslationDirectXrefORM, + } + self._xref_dependent_mapped = {} + + def get_source_id_for_source_name(self, source_name: str, dbi: Connection, priority_desc: str = None) -> int: + """Retrieves a source ID from its name and priority description from a database. 
+ + Parameters + ---------- + source_name: str + The name of the source + dbi: sqlalchemy.engine.Connection + The database connection to query in + priority_desc: str, optional + The priority description of the source (default is None) + + Returns + ------- + The source ID. + + Raises + ------ + KeyError + If no ID was found for the provided source name. + """ + low_name = source_name.lower() + + if priority_desc: + low_desc = priority_desc.lower() + query = select(SourceUORM.source_id).where( + func.lower(SourceUORM.name) == low_name, + func.lower(SourceUORM.priority_description) == low_desc, + ) + source_name = f"{source_name} ({priority_desc})" + else: + query = select(SourceUORM.source_id).where( + func.lower(SourceUORM.name) == low_name + ) + + result = dbi.execute(query) + if result: + source_id = result.scalar() + else: + raise KeyError(f"No source_id for source_name={source_name}") + + return source_id + + def get_source_name_for_source_id(self, source_id: int, dbi: Connection) -> str: + """Retrieves a source name from its ID from a database. + + Parameters + ---------- + source_id: int + The ID of the source + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + The source name. + + Raises + ------ + KeyError + If no name was found for the provided source ID. + """ + result = dbi.execute( + select(SourceUORM.name).where(SourceUORM.source_id == source_id) + ) + if result: + source_name = result.scalar() + else: + raise KeyError( + f"There is no entity with source-id {source_id} in the source-table of the xref-database. The source-id and the name of the source-id is hard-coded in populate_metadata.sql and in the parser. Couldn't get source name for source ID {source_id}" + ) + + return source_name + + def set_release(self, source_id: int, s_release: str, dbi: Connection) -> None: + """Sets the release value for a source in the source table of a database. + + Parameters + ---------- + source_id: str + The source ID + s_release: str + The release string + dbi: sqlalchemy.engine.Connection + The database connection to update in + """ + dbi.execute( + update(SourceUORM) + .where(SourceUORM.source_id == source_id) + .values(source_release=s_release) + ) + + def upload_xref_object_graphs(self, xrefs: List[Dict[str, Any]], dbi: Connection) -> None: + """Adds xref data into a database. + Uploads main xref data, related direct xrefs, dependent xrefs, and synonyms. + + Parameters + ---------- + xrefs: list + List of xrefs to upload + dbi: sqlalchemy.engine.Connection + The database connection to update in + + Raises + ------ + IOError + Failure is setting or retrieving an xref ID. 
+ """ + count = len(xrefs) + if count: + for xref in xrefs: + if not xref.get("ACCESSION") or not xref.get("SOURCE_ID"): + continue + + # Create entry in xref table and get ID + xref_id = self.add_xref( + { + "accession": xref["ACCESSION"], + "source_id": xref["SOURCE_ID"], + "species_id": xref["SPECIES_ID"], + "label": xref.get("LABEL", xref["ACCESSION"]), + "description": xref.get("DESCRIPTION"), + "version": xref.get("VERSION", 0), + "info_type": xref.get("INFO_TYPE", "MISC"), + }, + dbi, + True, + ) + + # Add direct xrefs + if xref.get("DIRECT_XREFS"): + for direct_xref in xref["DIRECT_XREFS"]: + direct_xref_id = self.add_xref( + { + "accession": xref["ACCESSION"], + "source_id": direct_xref["SOURCE_ID"], + "species_id": xref["SPECIES_ID"], + "label": xref.get("LABEL", xref["ACCESSION"]), + "description": xref.get("DESCRIPTION"), + "version": xref.get("VERSION", 0), + "info_type": direct_xref.get("LINKAGE_TYPE"), + }, + dbi, + True, + ) + + # direct_xref_id = self.get_xref_id(xref['ACCESSION'], direct_xref['SOURCE_ID'], xref['SPECIES_ID'], dbi) + self.add_direct_xref( + direct_xref_id, + direct_xref["STABLE_ID"], + direct_xref["ENSEMBL_TYPE"], + direct_xref["LINKAGE_TYPE"], + dbi, + ) + + # Error checking + if not xref_id: + raise IOError( + "xref_id is not set for %s %s %s %s %s" + % ( + xref["ACCESSION"], + xref["LABEL"], + xref["DESCRIPTION"], + xref["SOURCE_ID"], + xref["SPECIES_ID"], + ) + ) + + # Create entry in primary_xref table with sequence; if this is a "cumulative" + # entry it may already exist, and require an UPDATE rather than an INSERT + if xref.get("SEQUENCE"): + exists = dbi.execute( + select(PrimaryXrefORM.xref_id).where( + PrimaryXrefORM.xref_id == xref_id + ) + ).scalar() + + if exists: + query = ( + update(PrimaryXrefORM) + .where(PrimaryXrefORM.xref_id == xref_id) + .values(sequence=xref["SEQUENCE"]) + ) + else: + query = insert(PrimaryXrefORM).values( + xref_id=xref_id, + sequence=xref["SEQUENCE"], + sequence_type=xref["SEQUENCE_TYPE"], + status=xref.get("STATUS"), + ) + dbi.execute(query) + + # If there are synonyms, add entries in the synonym table + if xref.get("SYNONYMS"): + for synonym in xref["SYNONYMS"]: + self.add_synonym(xref_id, synonym, dbi) + + # If there are dependent xrefs, add xrefs and dependent xrefs for them + if xref.get("DEPENDENT_XREFS"): + for dependent_xref in xref.get("DEPENDENT_XREFS"): + # Insert the xref and get its xref_id + dependent_xref_id = self.add_xref( + { + "accession": dependent_xref["ACCESSION"], + "source_id": dependent_xref["SOURCE_ID"], + "species_id": xref["SPECIES_ID"], + "label": dependent_xref.get("LABEL"), + "description": dependent_xref.get("DESCRIPTION"), + "version": dependent_xref.get("VERSION"), + "info_type": "DEPENDENT", + }, + dbi, + ) + if not dependent_xref_id: + continue + + # Add the linkage_annotation and source id it came from + self.add_dependent_xref_maponly( + dependent_xref_id, + dependent_xref["LINKAGE_SOURCE_ID"], + xref_id, + dependent_xref.get("LINKAGE_ANNOTATION"), + dbi, + ) + + # If there are synonyms, add entries in the synonym table + if dependent_xref.get("SYNONYMS"): + for synonym in dependent_xref.get("SYNONYMS"): + self.add_synonym(dependent_xref_id, synonym, dbi) + + # Add the pair data. 
refseq dna/pep pairs usually + if xref_id and xref.get("PAIR"): + dbi.execute( + insert(PairsORM).values( + source_id=xref["SOURCE_ID"], + accession1=xref["ACCESSION"], + accession2=xref["PAIR"], + ) + ) + + def get_xref_id(self, accession: str, source_id: int, species_id: int, dbi: Connection) -> Optional[int]: + """Retrieves the xref row ID from accession, source ID, and species ID. + + Parameters + ---------- + accession: str + The xref accession + source_id: int + The xref source ID + species_id: int + The species ID + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + The xref ID, if found (else None). + """ + xref_ids = [] + + query = select(XrefUORM.xref_id).where( + XrefUORM.accession == accession, + XrefUORM.source_id == source_id, + XrefUORM.species_id == species_id, + ) + + for row in dbi.execute(query).mappings().all(): + xref_ids.append(row.xref_id) + + if len(xref_ids) > 0: + return xref_ids[0] + + return None + + def add_direct_xref(self, general_xref_id: int, ensembl_stable_id: str, ensembl_type: str, linkage_type: str, dbi: Connection) -> None: + """Adds data into direct xref tables in a database. + + Parameters + ---------- + general_xref_id: int + The xref ID related to the direct xref + ensembl_stable_id: str + The ensEMBL stable ID related to the direct xref + ensembl_type: str + The feature type (gene, transcript, or translation) + linkage_type: str + The type of link between the xref and ensEMBL feature + dbi: sqlalchemy.engine.Connection + The database connection to update in + """ + # Check if such a mapping exists yet + existing_xref_ids = self.get_direct_xref_id( + ensembl_stable_id, ensembl_type, linkage_type, dbi + ) + if general_xref_id in existing_xref_ids: + return None + + ensembl_type = ensembl_type.lower() + dbi.execute( + insert(self._direct_xref_tables[ensembl_type]).values( + general_xref_id=general_xref_id, + ensembl_stable_id=ensembl_stable_id, + linkage_xref=linkage_type, + ) + ) + + def add_to_direct_xrefs(self, args: Dict[str, Any], dbi: Connection) -> None: + """Adds direct xref data into both the xref table and direct xref tables in a database. + This calls the functions add_xref and add_direct_xref. + + Parameters + ---------- + args: dict + The direct xref arguments. 
These include: + - stable_id: The ensEMBL feature stable ID + - ensembl_type: The feature type (gene, transcript, or translation) + - accession: The xref accession + - source_id: The xref source ID + - species_id: The species ID + - version (optional): The xref version (default is 0) + - label (optional): The xref label (default is the xref accession) + - description (optional): The xref description + - linkage (optional): The type of link between the xref and ensEMBL + - info_text (optional): Additional info related to the xref (default is empty string) + - info_type (optional): The type of xref being added (default is DIRECT) + dbi: sqlalchemy.engine.Connection + The database connection to update in + """ + stable_id = args["stable_id"] + ensembl_type = args["ensembl_type"] + accession = args["accession"] + source_id = args["source_id"] + species_id = args["species_id"] + version = args.get("version", 0) + label = args.get("label", accession) + description = args.get("description") + linkage = args.get("linkage") + info_text = args.get("info_text", "") + + args["info_type"] = args.get("info_type", "DIRECT") + + # If the accession already has an xref find it else cretae a new one + direct_xref_id = self.add_xref(args, dbi) + self.add_direct_xref(direct_xref_id, stable_id, ensembl_type, linkage, dbi) + + def get_direct_xref_id(self, stable_id: str, ensembl_type: str, link: str, dbi: Connection) -> int: + """Retrieves the direct xref row ID from stable ID, ensEMBL type and linkage type. + + Parameters + ---------- + stable_id: str + The ensEMBL feature stable ID + ensembl_type: str + The feature type (gene, transcript, or translation) + link: str + The type of link between the xref and ensEMBL + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + The direct xref ID(s). + """ + direct_xref_ids = [] + + ensembl_type = ensembl_type.lower() + ensembl_table = self._direct_xref_tables[ensembl_type] + query = select(ensembl_table.general_xref_id).where( + ensembl_table.ensembl_stable_id == stable_id, + ensembl_table.linkage_xref == link, + ) + + for row in dbi.execute(query).mappings().all(): + direct_xref_ids.append(row.general_xref_id) + + return direct_xref_ids + + def add_xref(self, args: Dict[str, Any], dbi: Connection, update_label_desc: bool = False) -> int: + """Adds data into xref table in a database and returns the xref ID. + This function first checks if an xref already exists with the provided data. + + Parameters + ---------- + args: dict + The direct xref arguments. These include: + - accession: The xref accession + - source_id: The xref source ID + - species_id: The species ID + - label (optional): The xref label (default is the xref accession) + - description (optional): The xref description + - version (optional): The xref version (default is 0) + - info_type (optional): The type of xref being added (default is MISC) + - info_text (optional): Additional info related to the xref (default is empty string) + dbi: sqlalchemy.engine.Connection + The database connection to update in + update_label_desc: bool, optional + If set to True, the xref label and description will be updated even if the xref data already exists in the database (default is False) + + Returns + ------- + The xref ID (existing or newly added). 
+ """ + accession = args["accession"] + source_id = args["source_id"] + species_id = args["species_id"] + label = args.get("label", accession) + description = args.get("description") + version = args.get("version", 0) + info_type = args.get("info_type", "MISC") + info_text = args.get("info_text", "") + + # If the description is more than 255 characters, chop it off and add + # an indication that it has been truncated to the end of it. + if description and len(description) > 255: + description = description[0:249] + " /.../" + + # See if it already exists. If so return the xref_id for this one. + xref_id = self.get_xref_id(accession, source_id, species_id, dbi) + if xref_id: + if update_label_desc: + if label: + dbi.execute( + update(XrefUORM) + .where(XrefUORM.xref_id == xref_id) + .values(label=label) + ) + if description: + dbi.execute( + update(XrefUORM) + .where(XrefUORM.xref_id == xref_id) + .values(description=description) + ) + return xref_id + + # Add new xref + dbi.execute( + insert(XrefUORM).values( + accession=accession, + version=version, + label=label, + description=description, + source_id=source_id, + species_id=species_id, + info_type=info_type, + info_text=info_text, + ) + ) + + xref_id = self.get_xref_id(accession, source_id, species_id, dbi) + return xref_id + + def add_dependent_xref(self, args: Dict[str, Any], dbi: Connection) -> int: + """Adds data into the xref table and dependent xref table in a database. + + Parameters + ---------- + args: dict + The direct xref arguments. These include: + - master_xref_id: The main xref ID which the dependent xref is dependent on + - accession: The dependent xref accession + - source_id: The dependent xref source ID + - species_id: The species ID + - version (optional): The dependent xref version (default is 0) + - label (optional): The dependent xref label (default is the dependent xref accession) + - description (optional): The dependent xref description + - linkage (optional): The source ID of the main xref which the dependent xref id dependent on + - info_text (optional): Additional info related to the dependent xref (default is empty string) + - info_type (optional): The type of xref being added (default is DEPENDENT) + dbi: sqlalchemy.engine.Connection + The database connection to update in + + Returns + ------- + The dependent xref ID. + """ + master_xref_id = args["master_xref_id"] + accession = args["accession"] + source_id = args["source_id"] + species_id = args["species_id"] + version = args.get("version", 0) + label = args.get("label", accession) + description = args.get("description") + linkage = args.get("linkage") + info_text = args.get("info_text", "") + + args["info_type"] = args.get("info_type", "DEPENDENT") + + # If the accession already has an xref find it else cretae a new one + dependent_xref_id = self.add_xref(args, dbi) + self.add_dependent_xref_maponly( + dependent_xref_id, source_id, master_xref_id, linkage, dbi + ) + + return dependent_xref_id + + def add_dependent_xref_maponly(self, dependent_id: int, dependent_source_id: int, master_id: int, master_source_id: int, dbi: Connection, update_info_type: bool = False) -> None: + """Adds data into the dependent xref table in a database. + This function only adds the dependent connection if it hasn't been added before (from a cache). 
+ + Parameters + ---------- + dependent_id: int + The dependent xref ID + dependent_source_id: int + The source ID of the dependent xref + master_id: int + The master xref ID + master_source_id: int + The source ID of the master xref + dbi: sqlalchemy.engine.Connection + The database connection to update in + update_info_type: bool, optional + If set to True, the info_type column of the xref table related to the dependent xref will be updated to 'DEPENDENT' (default is False) + """ + index = f"{master_id}|{dependent_id}" + if ( + not self._xref_dependent_mapped.get(index) + or self._xref_dependent_mapped[index] != master_source_id + ): + dbi.execute( + insert(DependentXrefUORM) + .values( + master_xref_id=master_id, + dependent_xref_id=dependent_id, + linkage_annotation=master_source_id, + linkage_source_id=dependent_source_id, + ) + .prefix_with("IGNORE") + ) + + self._xref_dependent_mapped[index] = master_source_id + + if update_info_type: + self._update_xref_info_type(dependent_id, "DEPENDENT", dbi) + + def _update_xref_info_type(self, xref_id: int, info_type: str, dbi: Connection) -> None: + """Updates the info_type column of the xref table. + + Parameters + ---------- + xref_id: int + The xref ID + info_type: str + The info type value to update + dbi: sqlalchemy.engine.Connection + The database connection to update in + """ + dbi.execute( + update(XrefUORM) + .where(XrefUORM.xref_id == xref_id) + .values(info_type=info_type) + ) + + def get_xref_sources(self, dbi: Connection) -> Dict[str, int]: + """Retrieves the xref source names and ID from a database. + + Parameters + ---------- + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + A dict variable containing {'source_name' : 'source_ID'} items. + """ + sourcename_to_sourceid = {} + + query = select(SourceUORM.name, SourceUORM.source_id) + + for row in dbi.execute(query).mappings().all(): + sourcename_to_sourceid[row.name] = row.source_id + + return sourcename_to_sourceid + + def add_synonym(self, xref_id: int, synonym: str, dbi: Connection) -> None: + """Adds synonym data into the synonym table if a database. + + Parameters + ---------- + xref_id: int + The xref ID related to the synonym + synonym: str + The xref synonym + dbi: sqlalchemy.engine.Connection + The database connection to update in + """ + dbi.execute( + insert(SynonymORM) + .values(xref_id=xref_id, synonym=synonym) + .prefix_with("IGNORE") + ) + + def get_ext_synonyms(self, source_name: str, dbi: Connection) -> Dict[str, List[str]]: + """Retrieves the list of synonyms for a specific xref source. + + Parameters + ---------- + source_name: str + The xref source name + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + A dict variable containing {'accession' or 'label' : [list of synonyms]} items. 
+        """
+        ext_syns = {}
+        seen = {}
+        separator = ":"
+
+        query = (
+            select(XrefUORM.accession, XrefUORM.label, SynonymORM.synonym)
+            .where(
+                XrefUORM.xref_id == SynonymORM.xref_id,
+                SourceUORM.source_id == XrefUORM.source_id,
+            )
+            .filter(SourceUORM.name.like(source_name))
+        )
+
+        count = 0
+        for row in dbi.execute(query).mappings().all():
+            acc_syn = row.accession + separator + row.synonym
+            if not seen.get(acc_syn):
+                ext_syns.setdefault(row.accession, []).append(row.synonym)
+                ext_syns.setdefault(row.label, []).append(row.synonym)
+                count += 1
+
+            seen[acc_syn] = 1
+
+        return ext_syns
+
+    def build_dependent_mappings(self, source_id: int, dbi: Connection) -> None:
+        """Builds the dependent mappings cache for a specific xref source.
+        The resulting cache is a dict variable containing {'master_xref_id|dependent_xref_id' : 'linkage_annotation'} items.
+
+        Parameters
+        ----------
+        source_id: int
+            The xref source ID
+        dbi: sqlalchemy.engine.Connection
+            The database connection to query in
+        """
+        query = select(
+            DependentXrefUORM.master_xref_id,
+            DependentXrefUORM.dependent_xref_id,
+            DependentXrefUORM.linkage_annotation,
+        ).where(
+            DependentXrefUORM.dependent_xref_id == XrefUORM.xref_id,
+            XrefUORM.source_id == source_id,
+        )
+
+        for row in dbi.execute(query).mappings().all():
+            # The xref IDs are integers, so format the cache key explicitly
+            # instead of concatenating them with "|"
+            self._xref_dependent_mapped[
+                f"{row.master_xref_id}|{row.dependent_xref_id}"
+            ] = row.linkage_annotation
+
+    def get_valid_codes(self, source_name: str, species_id: int, dbi: Connection) -> Dict[str, List[int]]:
+        """Retrieves the xref accessions and IDs related to a specific xref source and species from a database.
+
+        Parameters
+        ----------
+        source_name: str
+            The xref source name
+        species_id: int
+            The species ID
+        dbi: sqlalchemy.engine.Connection
+            The database connection to query in
+
+        Returns
+        -------
+        A dict variable containing {'accession' : [list of xref IDs]} items.
+        """
+        valid_codes = {}
+        sources = []
+
+        big_name = "%" + source_name.upper() + "%"
+        query = select(SourceUORM.source_id).filter(
+            func.upper(SourceUORM.name).like(big_name)
+        )
+        for row in dbi.execute(query).fetchall():
+            sources.append(row[0])
+
+        for source_id in sources:
+            query = select(XrefUORM.accession, XrefUORM.xref_id).where(
+                XrefUORM.species_id == species_id, XrefUORM.source_id == source_id
+            )
+            for row in dbi.execute(query).fetchall():
+                valid_codes.setdefault(row[0], []).append(row[1])
+
+        return valid_codes
+
+    def is_file_header_valid(self, columns_count: int, field_patterns: List[str], header: List[str], case_sensitive: bool = False) -> bool:
+        """Checks whether the provided file header is valid by checking length and column patterns.
+
+        Parameters
+        ----------
+        columns_count: int
+            The number of columns to be expected in the header
+        field_patterns: list
+            The column patterns for the header to satisfy
+        header: list
+            The file header to check
+        case_sensitive: bool, optional
+            If set to True, header fields will be parsed as is, as opposed to lower-cased (default is False)
+
+        Returns
+        -------
+        True if the header is valid.
+        False if the header is not valid.
+ """ + # Check number of columns + if len(header) < columns_count: + return False + + # Check column patterns + for pattern in field_patterns: + header_field = header.pop(0) + if not case_sensitive: + header_field = header_field.lower() + if pattern and not re.search(pattern, header_field): + return False + + # If we have made it this far, all should be in order + return True + + def add_to_syn(self, accession: str, source_id: int, synonym: str, species_id: int, dbi: Connection) -> None: + """Add synomyn data for an xref given its accession and source ID. + + Parameters + ---------- + accession: str + The xref accession + source_id: int + The xref source ID + synonym: str + The xref synonym + species_id: int + The species ID + dbi: sqlalchemy.engine.Connection + The database connection to update in + + Raises + ------ + KeyError + If no xref is found for accession, source ID, and species ID. + """ + xref_id = self.get_xref_id(accession, source_id, species_id, dbi) + if xref_id: + self.add_synonym(xref_id, synonym, dbi) + else: + raise KeyError( + f"Could not find acc {accession} in xref table source = {source_id} of species {species_id}" + ) + + def add_to_syn_for_mult_sources(self, accession: str, sources: List[int], synonym: str, species_id: int, dbi: Connection) -> None: + """Adds synonym data for multiple sources. + + Parameters + ---------- + accession: str + The xref accession + sources: list + List of xref sources to add synonyms for + synonym: str + The xref synonym + species_id: int + The species ID + dbi: sqlalchemy.engine.Connection + The database connection to update in + """ + for source_id in sources: + xref_id = self.get_xref_id(accession, source_id, species_id, dbi) + if xref_id: + self.add_synonym(xref_id, synonym, dbi) + + def species_id_to_names(self, dbi: Connection) -> Dict[int, List[str]]: + """Creates a dictionary that contains the name and aliases for every species ID. + + Parameters + ---------- + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + A dict variable containing {'species_id' : [list of names/synonyms]} items. + """ + id_to_names = {} + + # Query the species table + query = select(SpeciesORM.species_id, SpeciesORM.name) + for row in dbi.execute(query).mappings().all(): + id_to_names[row.species_id] = [row.name] + + # Also populate the dict with all the aliases + query = select(SpeciesORM.species_id, SpeciesORM.aliases) + for row in dbi.execute(query).mappings().all(): + for name in re.split(r",\s*", row.aliases, flags=re.MULTILINE | re.DOTALL): + id_to_names.setdefault(row.species_id, []).append(name) + + return id_to_names + + def species_id_to_taxonomy(self, dbi: Connection) -> Dict[int, List[int]]: + """Creates a dictionary that contains the taxonomy IDs for every species ID. + + Parameters + ---------- + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + A dict variable containing {'species_id' : [list of taxonomy IDs]} items. + """ + id_to_taxonomy = {} + + # Query the species table + query = select(SpeciesORM.species_id, SpeciesORM.taxonomy_id) + for row in dbi.execute(query).mappings().all(): + id_to_taxonomy.setdefault(row.species_id, []).append(row.taxonomy_id) + + return id_to_taxonomy + + def get_valid_xrefs_for_dependencies(self, dependent_name: str, reverse_ordered_source_list: List[str], dbi: Connection) -> Dict[str, int]: + """Get a hash to go from accession of a dependent xref to master_xref_id for all of source names given. 
+ + Parameters + ---------- + dependent_name: str + The dependent source name + reverse_ordered_source_list: list + List of source names + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + A dict variable containing {'accession' : 'master_xred_id'} items. + """ + dependent_2_xref = {} + dependent_sources = [] + sources = [] + + query = select(SourceUORM.source_id).where( + func.lower(SourceUORM.name) == dependent_name.lower() + ) + for row in dbi.execute(query).fetchall(): + dependent_sources.append(row[0]) + + for name in reverse_ordered_source_list: + query = select(SourceUORM.source_id).where( + func.lower(SourceUORM.name) == name.lower() + ) + for row in dbi.execute(query).fetchall(): + sources.append(row[0]) + + Xref1 = aliased(XrefUORM) + Xref2 = aliased(XrefUORM) + + for dependent in dependent_sources: + for source in sources: + query = select(DependentXrefUORM.master_xref_id, Xref2.accession).where( + Xref1.xref_id == DependentXrefUORM.master_xref_id, + Xref1.source_id == source, + Xref2.xref_id == DependentXrefUORM.dependent_xref_id, + Xref2.source_id == dependent, + ) + for row in dbi.execute(query).fetchall(): + dependent_2_xref[row[1]] = row[0] + + return dependent_2_xref + + def get_source_ids_for_source_name_pattern(self, source_name: str, dbi: Connection) -> List[int]: + """Gets a set of source IDs matching a source name pattern. + + Parameters + ---------- + source_name: str + The name of the source + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + A list of source IDs. + """ + big_name = "%" + source_name.upper() + "%" + sources = [] + + query = select(SourceUORM.source_id).where( + func.upper(SourceUORM.name).like(big_name) + ) + for row in dbi.execute(query).fetchall(): + sources.append(row[0]) + + return sources + + def get_acc_to_label(self, source_name: str, species_id: int, dbi: Connection) -> Dict[str, str]: + """Creates a hash that uses the accession as a key and the label as the value. + + Parameters + ---------- + source_name: str + The name of the source + species_id: int + The species ID + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + A dict variable containing {'accession' : 'label'} items. + """ + acc_to_label = {} + + source_name = source_name + "%" + query = select(XrefUORM.accession, XrefUORM.label).where( + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name.like(source_name), + XrefUORM.species_id == species_id, + ) + for row in dbi.execute(query).mappings().all(): + acc_to_label[row.accession] = row.label + + return acc_to_label + + def extract_params_from_string(self, string: str, parameters: List[str]) -> List[str]: + values = [] + + for param in parameters: + val = None + + match = re.search(param + r"[=][>](\S+?)[,]", string) + if match: + val = match.group(1) + + values.append(val) + + return values diff --git a/src/python/ensembl/production/xrefs/parsers/CCDSParser.py b/src/python/ensembl/production/xrefs/parsers/CCDSParser.py new file mode 100644 index 000000000..f2e258716 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/CCDSParser.py @@ -0,0 +1,101 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for CCDS source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class CCDSParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + dba = args["dba"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Extract db connection parameters from file + db_user = "ensro" + db_host, db_port, db_name, db_pass = self.extract_params_from_string( + file, ["host", "port", "dbname", "pass"] + ) + if not db_port: + db_port = "3306" + + # Connect to the appropriate db + if db_host: + ccds_db_url = URL.create( + "mysql", db_user, db_pass, db_host, db_port, db_name + ) + elif dba: + ccds_db_url = dba + + if not ccds_db_url: + return 1, "Could not find CCDS DB." + else: + if verbose: + logging.info(f"Found CCDS DB: {ccds_db_url}") + + # Get data from ccds db + db_engine = self.get_db_engine(ccds_db_url) + with db_engine.connect() as ccds_dbi: + query = ( + select(TranscriptORM.stable_id, XrefCORM.dbprimary_acc) + .where( + XrefCORM.xref_id == ObjectXrefCORM.xref_id, + ObjectXrefCORM.ensembl_object_type == "Transcript", + ObjectXrefCORM.ensembl_id == TranscriptORM.transcript_id, + ExternalDbORM.external_db_id == XrefCORM.external_db_id, + ) + .filter(ExternalDbORM.db_name.like("Ens_%_transcript")) + ) + result = ccds_dbi.execute(query).mappings().all() + + xref_count, direct_count = 0, 0 + seen = {} + + for row in result: + stable_id = row.stable_id + display_label = row.dbprimary_acc + + (acc, version) = display_label.split(".") + + if not seen.get(display_label): + xref_id = self.add_xref( + { + "accession": acc, + "version": version, + "label": display_label, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + + xref_count += 1 + seen[display_label] = xref_id + else: + xref_id = seen[display_label] + + self.add_direct_xref(xref_id, stable_id, "Transcript", "", xref_dbi) + direct_count += 1 + + result_message = f"Parsed CCDS identifiers from {file}, added {xref_count} xrefs and {direct_count} direct_xrefs" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/DBASSParser.py b/src/python/ensembl/production/xrefs/parsers/DBASSParser.py new file mode 100644 index 000000000..9f3f6243a --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/DBASSParser.py @@ -0,0 +1,114 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for DBASS sources.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + +EXPECTED_NUMBER_OF_COLUMNS = 23 + + +class DBASSParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + + if not source_id or not species_id or not xref_file: + raise AttributeError("Need to pass source_id, species_id and file") + + file_io = self.get_filehandle(xref_file) + csv_reader = csv.reader(file_io) + + # Check if header is valid + header = next(csv_reader) + patterns = [r"^id$", r"^genesymbol$", None, r"^ensemblreference$"] + if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise IOError(f"Malformed or unexpected header in DBASS file {xref_file}") + + processed_count = 0 + unmapped_count = 0 + + # Read lines + for line in csv_reader: + if not line: + continue + + if len(line) < EXPECTED_NUMBER_OF_COLUMNS: + line_number = 2 + processed_count + unmapped_count + raise IOError( + f"Line {line_number} of input file {xref_file} has an incorrect number of columns" + ) + + dbass_gene_id = line[0] + dbass_gene_name = line[1] + dbass_full_name = line[2] + ensembl_id = line[3] + + # Do not attempt to create unmapped xrefs. Checking truthiness is good + # enough here because the only non-empty string evaluating as false is + # not a valid Ensembl stable ID. + if ensembl_id: + # DBASS files list synonyms in two ways: either "FOO (BAR)" (with or + # without space) or "FOO/BAR". Both forms are relevant to us. + match = re.search( + r"(.*)\s?/\s?(.*)", dbass_gene_name, re.IGNORECASE | re.DOTALL + ) + if match: + first_gene_name = match.group(1) + second_gene_name = match.group(2) + else: + match = re.search( + r"(.*)\s?\((.*)\)", dbass_gene_name, re.IGNORECASE | re.DOTALL + ) + if match: + first_gene_name = match.group(1) + second_gene_name = match.group(2) + else: + first_gene_name = dbass_gene_name + second_gene_name = None + + label = first_gene_name + synonym = second_gene_name + ensembl_type = "gene" + version = "1" + + xref_id = self.add_xref( + { + "accession": dbass_gene_id, + "version": version, + "label": label, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + + if synonym: + self.add_synonym(xref_id, synonym, xref_dbi) + + self.add_direct_xref(xref_id, ensembl_id, ensembl_type, "", xref_dbi) + + processed_count += 1 + else: + unmapped_count += 1 + + file_io.close() + + result_message = f"{processed_count} direct xrefs successfully processed\n" + result_message += f"Skipped {unmapped_count} unmapped xrefs" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py b/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py new file mode 100644 index 000000000..33a7328a2 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py @@ -0,0 +1,120 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for EntrezGene and WikiGene sources.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + +EXPECTED_NUMBER_OF_COLUMNS = 16 + + +class EntrezGeneParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + wiki_source_id = self.get_source_id_for_source_name("WikiGene", xref_dbi) + if verbose: + logging.info(f"Wiki source id = {wiki_source_id}") + + file_io = self.get_filehandle(file) + csv_reader = csv.reader(file_io, delimiter="\t") + + # Check if header is valid + header = next(csv_reader) + patterns = [ + r"\A[#]?\s*tax_id", + "geneid", + "symbol", + "locustag", + "synonyms", + "dbxrefs", + "chromosome", + "map_location", + "description", + "type_of_gene", + "symbol_from_nomenclature_authority", + "full_name_from_nomenclature_authority", + "nomenclature_status", + "other_designations", + "modification_date", + "feature_type", + ] + if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise IOError(f"Malformed or unexpected header in EntrezGene file {file}") + + xref_count = 0 + syn_count = 0 + seen = {} + + # Read lines + for line in csv_reader: + if not line: + continue + + tax_id = line[0] + acc = line[1] + symbol = line[2] + synonyms = line[4] + desc = line[8] + + if tax_id != species_id: + continue + if seen.get(acc): + continue + + xref_id = self.add_xref( + { + "accession": acc, + "label": symbol, + "description": desc, + "source_id": source_id, + "species_id": species_id, + "info_type": "DEPENDENT", + }, + xref_dbi, + ) + self.add_xref( + { + "accession": acc, + "label": symbol, + "description": desc, + "source_id": wiki_source_id, + "species_id": species_id, + "info_type": "DEPENDENT", + }, + xref_dbi, + ) + + xref_count += 1 + + syns = re.split(r"\|", synonyms) + for synonym in syns: + if synonym != "-": + self.add_synonym(xref_id, synonym, xref_dbi) + syn_count += 1 + + seen[acc] = 1 + + file_io.close() + + result_message = f"{xref_count} EntrezGene Xrefs and {xref_count} WikiGene Xrefs added with {syn_count} synonyms" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/HGNCParser.py b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py new file mode 100644 index 000000000..9bcda9cbd --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py @@ -0,0 +1,421 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for HGNC source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + +from unidecode import unidecode +import codecs + + +class HGNCParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + dba = args["dba"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Parse the file string and set default user + file_params = self.parse_file_string(file) + if not file_params.get("user"): + file_params["user"] = "ensro" + + # Prepare lookup lists + swissprot = self.get_valid_codes("Uniprot/SWISSPROT", species_id, xref_dbi) + refseq = self.get_valid_codes("refseq", species_id, xref_dbi) + source_list = ["refseq_peptide", "refseq_mRNA"] + entrezgene = self.get_valid_xrefs_for_dependencies( + "EntrezGene", source_list, xref_dbi + ) + + # Prepare sources + self_source_name = self.get_source_name_for_source_id(source_id, xref_dbi) + source_ids = { + "ccds": self.get_source_id_for_source_name( + self_source_name, xref_dbi, "ccds" + ), + "entrezgene_manual": self.get_source_id_for_source_name( + self_source_name, xref_dbi, "entrezgene_manual" + ), + "refseq_manual": self.get_source_id_for_source_name( + self_source_name, xref_dbi, "refseq_manual" + ), + "ensembl_manual": self.get_source_id_for_source_name( + self_source_name, xref_dbi, "ensembl_manual" + ), + "desc_only": self.get_source_id_for_source_name( + self_source_name, xref_dbi, "desc_only" + ), + "lrg": self.get_source_id_for_source_name("LRG_HGNC_notransfer", xref_dbi), + "genecards": self.get_source_id_for_source_name("GeneCards", xref_dbi), + } + + # Statistics counts + name_count = { + "ccds": 0, + "lrg": 0, + "ensembl_manual": 0, + "genecards": 0, + "refseq_manual": 0, + "entrezgene_manual": 0, + } + mismatch = 0 + + # Connect to the ccds db + ccds_db_url = None + if dba: + ccds_db_url = dba + elif file_params.get("host"): + ccds_db_url = URL.create( + "mysql", + file_params["user"], + file_params["pass"], + file_params["host"], + file_params["port"], + file_params["dbname"], + ) + else: + raise AttributeError("No ensembl ccds database provided") + + if not ccds_db_url: + raise AttributeError("No ensembl ccds database provided") + else: + if verbose: + logging.info(f"Found ccds DB: {ccds_db_url}") + + # Get CCDS data + db_engine = self.get_db_engine(ccds_db_url) + with db_engine.connect() as ccds_dbi: + query = ( + select(TranscriptAttribORM.value, TranscriptORM.stable_id) + .join( + TranscriptAttribORM, + TranscriptORM.transcript_id == TranscriptAttribORM.transcript_id, + ) + .join( + AttribTypeORM, + TranscriptAttribORM.attrib_type_id == AttribTypeORM.attrib_type_id, + ) + .where(AttribTypeORM.code == "ccds_transcript") + ) + result = ccds_dbi.execute(query).mappings().all() + + ccds_to_ens = {} + for row in result: + # Remove version + ccds_id = re.sub(r"\.\d+", "", row.value) + + ccds_to_ens[ccds_id] = row.stable_id + + # Get HGNC file (wget 
or disk) + mem_file = file + if file_params.get("wget"): + response = requests.get(file_params["wget"]) + if not response.ok: + raise IOError(response.reason) + mem_file = response.text + + # Make sure the file is utf8 + mem_file = codecs.encode(mem_file, "utf-8").decode("utf-8") + mem_file = re.sub(r'"', '', mem_file) + + file_io = self.get_filehandle(mem_file) + csv_reader = csv.DictReader(file_io, delimiter="\t") + + # Read lines + for line in csv_reader: + accession = line["HGNC ID"] + symbol = line["Approved symbol"] + name = line["Approved name"] + previous_symbols = line["Previous symbols"] + synonyms = line["Alias symbols"] + + seen = 0 + + # Direct CCDS to ENST mappings + ccds = line["CCDS IDs"] + ccds_list = [] + if ccds: + ccds_list = re.split(r",\s", ccds) + + for ccds in ccds_list: + enst_id = ccds_to_ens.get(ccds) + if not enst_id: + continue + + self.add_to_direct_xrefs( + { + "stable_id": enst_id, + "ensembl_type": "gene", + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids["ccds"], + "species_id": species_id, + }, + xref_dbi, + ) + self.add_synonyms_for_hgnc( + { + "source_id": source_ids["ccds"], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + name_count["ccds"] += 1 + + # Direct LRG to ENST mappings + lrg_id = line["Locus specific databases"] + if lrg_id: + match = re.search(r"(LRG_\d+)\|", lrg_id) + if match: + lrg_id = match.group(1) + + self.add_to_direct_xrefs( + { + "stable_id": lrg_id, + "ensembl_type": "gene", + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids["lrg"], + "species_id": species_id, + }, + xref_dbi, + ) + self.add_synonyms_for_hgnc( + { + "source_id": source_ids["lrg"], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + name_count["lrg"] += 1 + + # Direct Ensembl mappings + ensg_id = line["Ensembl gene ID"] + if ensg_id: + seen = 1 + + self.add_to_direct_xrefs( + { + "stable_id": ensg_id, + "ensembl_type": "gene", + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids["ensembl_manual"], + "species_id": species_id, + }, + xref_dbi, + ) + self.add_synonyms_for_hgnc( + { + "source_id": source_ids["ensembl_manual"], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + name_count["ensembl_manual"] += 1 + + # GeneCards + direct_id = self.get_xref_id( + accession, source_ids["ensembl_manual"], species_id, xref_dbi + ) + hgnc_id = re.search(r"HGNC:(\d+)", accession).group(1) + + self.add_dependent_xref( + { + "master_xref_id": direct_id, + "accession": hgnc_id, + "label": symbol, + "description": name, + "source_id": source_ids["genecards"], + "species_id": species_id, + }, + xref_dbi, + ) + self.add_synonyms_for_hgnc( + { + "source_id": source_ids["genecards"], + "name": hgnc_id, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + name_count["genecards"] += 1 + + # RefSeq + refseq_id = line["RefSeq IDs"] + if refseq_id and refseq.get(refseq_id): + seen = 1 + + for xref_id in refseq[refseq_id]: + self.add_dependent_xref( + { + "master_xref_id": xref_id, + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids["refseq_manual"], + "species_id": species_id, + }, + xref_dbi, + ) + name_count["refseq_manual"] += 1 + + 
self.add_synonyms_for_hgnc( + { + "source_id": source_ids["refseq_manual"], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + # EntrezGene + entrez_id = line["NCBI Gene ID"] + if entrez_id and entrezgene.get(entrez_id): + seen = 1 + + self.add_dependent_xref( + { + "master_xref_id": entrezgene[entrez_id], + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids["entrezgene_manual"], + "species_id": species_id, + }, + xref_dbi, + ) + self.add_synonyms_for_hgnc( + { + "source_id": source_ids["entrezgene_manual"], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + name_count["entrezgene_manual"] += 1 + + # Store to keep descriptions if not stored yet + if not seen: + xref_id = self.add_xref( + { + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids["desc_only"], + "species_id": species_id, + "info_type": "MISC", + }, + xref_dbi, + ) + self.add_synonyms_for_hgnc( + { + "source_id": source_ids["desc_only"], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + mismatch += 1 + + file_io.close() + + result_message = "HGNC xrefs loaded:\n" + for count_type, count in name_count.items(): + result_message += f"\t{count_type}\t{count}\n" + result_message += f"{mismatch} HGNC ids could not be associated in xrefs" + + return 0, result_message + + def add_synonyms_for_hgnc(self, args: Dict[str, Any], dbi: Connection) -> None: + source_id = args["source_id"] + name = args["name"] + species_id = args["species_id"] + dead_string = args.get("dead") + alias_string = args.get("alias") + + # Dead name, add to synonym + if dead_string: + dead_string = re.sub('"', "", dead_string) + dead_array = re.split(r",\s", dead_string) + + for dead in dead_array: + try: + dead = dead.decode("utf-8") + except: + pass + dead = unidecode(dead.upper()) + + self.add_to_syn(name, source_id, dead, species_id, dbi) + + # Alias name, add to synonym + if alias_string: + alias_string = re.sub('"', "", alias_string) + alias_array = re.split(r",\s", alias_string) + + for alias in alias_array: + try: + alias = alias.decode("utf-8") + except: + pass + alias = unidecode(alias.upper()) + + self.add_to_syn(name, source_id, alias, species_id, dbi) + + def parse_file_string(self, file_string: str) -> Dict[str, str]: + # file_string = re.sub(r"\A\w+:", "", file_string) + file_string = re.sub(r"^\w+:", "", file_string) + + param_pairs = file_string.split(",") + params = {} + + # Set provided values + for pair in param_pairs: + if re.search("=>", pair): + key, value = pair.split("=>") + params[key] = value + + return params diff --git a/src/python/ensembl/production/xrefs/parsers/HPAParser.py b/src/python/ensembl/production/xrefs/parsers/HPAParser.py new file mode 100644 index 000000000..76c99d769 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/HPAParser.py @@ -0,0 +1,74 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for HPA source.""" + +import csv + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser + +EXPECTED_NUMBER_OF_COLUMNS = 4 + + +class HPAParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + file_io = self.get_filehandle(file) + csv_reader = csv.reader(file_io, delimiter=",", strict=True) + + # Check if header is valid + header = next(csv_reader) + patterns = ["antibody", "antibody_id", "ensembl_peptide_id", "link"] + if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise IOError(f"Malformed or unexpected header in HPA file {file}") + + parsed_count = 0 + + # Read lines + for line in csv_reader: + if not line: + continue + + antibody_name = line[0] + antibody_id = line[1] + ensembl_id = line[2] + + self.add_to_direct_xrefs( + { + "accession": antibody_id, + "version": "1", + "label": antibody_name, + "stable_id": ensembl_id, + "ensembl_type": "translation", + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + + parsed_count += 1 + + file_io.close() + + result_message = f"{parsed_count} direct xrefs succesfully parsed" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py b/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py new file mode 100644 index 000000000..8ce883d1d --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py @@ -0,0 +1,60 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
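The HPAParser above turns each antibody row of the comma-separated HPA file into a translation-level direct xref. Below is a minimal, self-contained sketch of that row-to-dict mapping; the sample rows, the collect_hpa_xrefs helper, and the IDs used are illustrative assumptions, not code or data from this patch.

import csv
import io
from typing import Any, Dict, List

def collect_hpa_xrefs(handle, source_id: int, species_id: int) -> List[Dict[str, Any]]:
    # Mirror the HPA column layout: antibody, antibody_id, ensembl_peptide_id, link
    reader = csv.reader(handle, delimiter=",")
    next(reader)  # skip the header row
    xrefs = []
    for antibody_name, antibody_id, ensembl_id, _link in reader:
        xrefs.append({
            "accession": antibody_id,
            "version": "1",
            "label": antibody_name,
            "stable_id": ensembl_id,
            "ensembl_type": "translation",
            "source_id": source_id,
            "species_id": species_id,
            "info_type": "DIRECT",
        })
    return xrefs

# Hypothetical sample shaped like the expected HPA file
sample = io.StringIO(
    "Antibody,antibody_id,ensembl_peptide_id,link\n"
    "CAB000001,1,ENSP00000363822,http://www.proteinatlas.org/ENSG00000000003\n"
)
for xref in collect_hpa_xrefs(sample, source_id=100, species_id=9606):
    print(xref)  # each dict would be handed to add_to_direct_xrefs(xref, xref_dbi)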
+ +"""Parser module for JGI source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + +from Bio import SeqIO + + +class JGI_ProteinParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + xrefs = [] + + file_io = self.get_filehandle(file) + fasta_sequences = SeqIO.parse(file_io, "fasta") + + for fasta in fasta_sequences: + accession = fasta.id + sequence = fasta.seq + + # Extract accession value + accession = re.search(r"^ci0100(\w+?)$", accession).group(1) + + # Build an xref object and store it + xref = { + "ACCESSION": accession, + "SEQUENCE": sequence, + "SOURCE_ID": source_id, + "SPECIES_ID": species_id, + "SEQUENCE_TYPE": "peptide", + } + xrefs.append(xref) + + file_io.close() + + self.upload_xref_object_graphs(xrefs, xref_dbi) + + result_message = "%d JGI_ xrefs succesfully parsed" % len(xrefs) + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/MGIParser.py b/src/python/ensembl/production/xrefs/parsers/MGIParser.py new file mode 100644 index 000000000..2508d516a --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/MGIParser.py @@ -0,0 +1,72 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
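The JGI_ProteinParser above strips the ci0100 prefix from each FASTA identifier and queues one peptide sequence xref per record for upload_xref_object_graphs. A rough, self-contained sketch of that accession and sequence extraction follows; the FASTA snippet and the jgi_fasta_to_xrefs helper are hypothetical, while Bio.SeqIO is the same Biopython reader the parser itself uses.

import io
import re
from Bio import SeqIO

def jgi_fasta_to_xrefs(handle, source_id: int, species_id: int):
    xrefs = []
    for record in SeqIO.parse(handle, "fasta"):
        match = re.search(r"^ci0100(\w+?)$", record.id)
        if not match:
            continue  # skip identifiers that do not follow the ci0100 convention
        xrefs.append({
            "ACCESSION": match.group(1),
            "SEQUENCE": str(record.seq),
            "SOURCE_ID": source_id,
            "SPECIES_ID": species_id,
            "SEQUENCE_TYPE": "peptide",
        })
    return xrefs

# Hypothetical two-record FASTA input
fasta = io.StringIO(">ci0100130001\nMSTAVLENPGLGRKLSD\n>ci0100130002\nMAHIDQRSTLK\n")
print(jgi_fasta_to_xrefs(fasta, source_id=120, species_id=7719))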
+ +"""Parser module for MGI source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class MGIParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + syn_hash = self.get_ext_synonyms("MGI", xref_dbi) + + file_io = self.get_filehandle(file) + csv_reader = csv.reader(file_io, delimiter="\t", strict=True) + + count = 0 + syn_count = 0 + + # Read lines + for line in csv_reader: + if not line: + continue + + accession = line[0] + ensembl_id = line[5] + + xref_id = self.add_xref( + { + "accession": accession, + "version": 0, + "label": line[1], + "description": line[2], + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + self.add_direct_xref(xref_id, ensembl_id, "Gene", "", xref_dbi) + + if syn_hash.get(accession): + for synonym in syn_hash[accession]: + self.add_synonym(xref_id, synonym, xref_dbi) + syn_count += 1 + + count += 1 + + file_io.close() + + result_message = f"{count} direct MGI xrefs added\n" + result_message += f"{syn_count} synonyms added" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py b/src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py new file mode 100644 index 000000000..ae1fbb3dd --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py @@ -0,0 +1,107 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
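The MGIParser above reads the tab-separated MGI report (accession in column 1, symbol in column 2, name in column 3, Ensembl gene ID in column 6), adds one direct gene xref per row, and reuses any synonyms already stored for MGI. A small sketch of that shaping step is below; the sample row and the stand-in synonym cache are invented for illustration.

import csv
import io

# Stand-in for get_ext_synonyms("MGI", dbi)
syn_lookup = {"MGI:87853": ["Alpha-1-B", "A1b"]}

rows = io.StringIO("MGI:87853\ta1b\talpha-1-B glycoprotein\t15\t8\tENSMUSG00000022347\n")
for line in csv.reader(rows, delimiter="\t"):
    accession, symbol, description, _chrom, _pos, ensembl_id = line[:6]
    xref = {
        "accession": accession,
        "version": 0,
        "label": symbol,
        "description": description,
        "info_type": "DIRECT",
    }
    synonyms = syn_lookup.get(accession, [])
    # The real parser calls add_xref(xref, dbi), add_direct_xref(xref_id, ensembl_id,
    # "Gene", "", dbi) and add_synonym(...) for each synonym at this point
    print(xref, "->", ensembl_id, "synonyms:", synonyms)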
+ +"""Parser module for MGI CCDS source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class MGI_CCDS_Parser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + source_ids = [] + labels = {} + versions = {} + descriptions = {} + accessions = {} + + query = select(SourceUORM.source_id).filter(SourceUORM.name.like("MGI")) + result = xref_dbi.execute(query).fetchall() + for row in result: + source_ids.append(row[0]) + + query = select( + XrefUORM.accession, XrefUORM.label, XrefUORM.version, XrefUORM.description + ).filter(XrefUORM.source_id.in_(source_ids)) + + for row in xref_dbi.execute(query).mappings().all(): + if row["description"]: + accessions[row["label"]] = row.accession + labels[row["accession"]] = row.label + versions[row["accession"]] = row.version + descriptions[row["accession"]] = row.description + + # Get master xref ids via the ccds label + ccds_label_to_xref_id = {} + query = select(XrefUORM.label, XrefUORM.xref_id).where( + XrefUORM.source_id == SourceUORM.source_id, SourceUORM.name == "CCDS" + ) + result = xref_dbi.execute(query).fetchall() + for row in result: + ccds_label_to_xref_id[row[0]] = row[1] + + count = 0 + ccds_missing = 0 + mgi_missing = 0 + + mgi_io = self.get_filehandle(file) + for line in mgi_io: + line = line.rstrip() + if not line: + continue + + fields = line.split("\t") + chromosome = fields[0] + g_accession = fields[1] + gene_name = fields[2] + entrez_id = fields[3] + ccds = fields[4] + + if ccds_label_to_xref_id.get(ccds): + if accessions.get(gene_name) and labels.get(accessions[gene_name]): + accession = accessions[gene_name] + self.add_dependent_xref( + { + "master_xref_id": ccds_label_to_xref_id[ccds], + "accession": accession, + "version": versions[accession], + "label": labels[accession], + "description": descriptions[accession], + "source_id": source_id, + "species_id": species_id, + }, + xref_dbi, + ) + + count += 1 + else: + mgi_missing += 1 + else: + ccds_missing += 1 + + mgi_io.close() + + result_message = f"Added {count} MGI xrefs via CCDS\n" + result_message += ( + f"{ccds_missing} CCDS not resolved, {mgi_missing} MGI not found" + ) + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py b/src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py new file mode 100644 index 000000000..010298200 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py @@ -0,0 +1,101 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
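The MGI_CCDS_Parser above resolves each file row twice: the CCDS label must already exist as a CCDS xref (the master), and the gene symbol must map back to a known MGI accession before a dependent xref is added. A toy model of that double lookup follows; the cache contents, IDs, and the resolve helper are invented for illustration.

# Invented caches standing in for the queries against the xref database
ccds_label_to_xref_id = {"CCDS18166": 501}          # CCDS label -> master xref_id
mgi_accession_for_symbol = {"Pax6": "MGI:97490"}    # gene symbol -> MGI accession
mgi_label = {"MGI:97490": "Pax6"}
mgi_description = {"MGI:97490": "paired box 6"}

def resolve(row):
    _chrom, _g_acc, gene_name, _entrez, ccds = row
    master_xref_id = ccds_label_to_xref_id.get(ccds)
    accession = mgi_accession_for_symbol.get(gene_name)
    if master_xref_id is None:
        return "ccds_missing"
    if accession is None or accession not in mgi_label:
        return "mgi_missing"
    # add_dependent_xref would be called here with the master xref and MGI details
    return {
        "master_xref_id": master_xref_id,
        "accession": accession,
        "label": mgi_label[accession],
        "description": mgi_description[accession],
    }

print(resolve(["2", "NC_000068.8", "Pax6", "18508", "CCDS18166"]))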
+ +"""Parser module for MGI Descriptions.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + +EXPECTED_NUMBER_OF_COLUMNS = 12 + + +class MGI_Desc_Parser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + file_io = self.get_filehandle(file) + csv_reader = csv.reader( + file_io, delimiter="\t", strict=True, quotechar=None, escapechar=None + ) + + # Check if header is valid + header = next(csv_reader) + patterns = [ + "mgi accession id", + "chr", + "cm position", + "genome coordinate start", + "genome coordinate end", + "strand", + "marker symbol", + "status", + "marker name", + "marker type", + "feature type", + r"marker\ssynonyms\s\(pipe\-separated\)", + ] + if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise IOError(f"Malformed or unexpected header in MGI_desc file {file}") + + xref_count = 0 + syn_count = 0 + acc_to_xref = {} + + # Read lines + for line in csv_reader: + if not line: + continue + + accession = line[0] + marker = line[8] + + xref_id = self.add_xref( + { + "accession": accession, + "label": line[6], + "description": marker, + "source_id": source_id, + "species_id": species_id, + "info_type": "MISC", + }, + xref_dbi, + ) + acc_to_xref[accession] = xref_id + + if not marker and verbose: + logging.info(f"{accession} has no description") + + xref_count += 1 + + if acc_to_xref.get(accession): + synonym_field = line[11] + if synonym_field: + synonyms = re.split(r"[|]", synonym_field) + + for synonym in synonyms: + self.add_synonym(xref_id, synonym, xref_dbi) + syn_count += 1 + + file_io.close() + + result_message = f"{xref_count} MGI Description Xrefs added\n" + result_message += f"{syn_count} synonyms added" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/MIMParser.py b/src/python/ensembl/production/xrefs/parsers/MIMParser.py new file mode 100644 index 000000000..1ae4f5952 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/MIMParser.py @@ -0,0 +1,159 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
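In the MGI_Desc_Parser above, column 7 carries the marker symbol used as the label, column 9 the marker name used as the description, and column 12 a pipe-separated synonym list. A compact sketch of that per-row handling follows; the sample line is invented and the calls into the xref database are only indicated in comments.

import re

# Hypothetical 12-column MGI report line
line = [
    "MGI:1918911", "1", "", "", "", "+", "0610005C13Rik", "O",
    "RIKEN cDNA 0610005C13 gene", "Gene", "lncRNA gene", "Gm33141|Gm52279",
]

xref = {
    "accession": line[0],
    "label": line[6],
    "description": line[8],
    "info_type": "MISC",
}
synonyms = [s for s in re.split(r"[|]", line[11]) if s]
# The real parser calls add_xref(xref, dbi) and then add_synonym(xref_id, s, dbi)
# for every entry in the synonym list
print(xref, synonyms)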
+ +"""Parser module for MIM source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class MIMParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + general_source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not general_source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + old_to_new, removed = {}, {} + sources = [] + + sources.append(general_source_id) + + gene_source_id = self.get_source_id_for_source_name("MIM_GENE", xref_dbi) + sources.append(gene_source_id) + morbid_source_id = self.get_source_id_for_source_name("MIM_MORBID", xref_dbi) + sources.append(morbid_source_id) + + TYPE_SINGLE_SOURCES = { + "*": gene_source_id, + "": morbid_source_id, + "#": morbid_source_id, + "%": morbid_source_id, + } + + counters = {gene_source_id: 0, morbid_source_id: 0, "removed": 0, "synonyms": 0} + + if verbose: + logging.info("Sources are: " + ", ".join(map(str, sources))) + + for section in self.get_file_sections(file, "*RECORD*"): + if len(section) == 1: + continue + + record = "".join(section) + + # Extract the TI field + ti = self.extract_ti(record) + if not ti: + raise IOError("Failed to extract TI field from record") + + # Extract record type + (record_type, number, long_desc) = self.parse_ti(ti) + if record_type is None: + raise IOError( + "Failed to extract record type and description from TI field" + ) + + # Use the first block of text as description + fields = re.split(";;", long_desc, flags=re.MULTILINE | re.DOTALL) + label = fields[0] + label = f"{label} [{record_type}{number}]" + + xref_object = { + "accession": number, + "label": label, + "description": long_desc, + "species_id": species_id, + "info_type": "UNMAPPED", + } + + if TYPE_SINGLE_SOURCES.get(record_type): + type_source = TYPE_SINGLE_SOURCES[record_type] + xref_object["source_id"] = type_source + counters[type_source] += 1 + + xref_id = self.add_xref(xref_object, xref_dbi) + elif record_type == "+": + # This type means both gene and phenotype, add both + xref_object["source_id"] = gene_source_id + counters[gene_source_id] += 1 + xref_id = self.add_xref(xref_object, xref_dbi) + + xref_object["source_id"] = morbid_source_id + counters[morbid_source_id] += 1 + xref_id = self.add_xref(xref_object, xref_dbi) + elif record_type == "^": + match = re.search( + r"MOVED\sTO\s(\d+)", long_desc, flags=re.MULTILINE | re.DOTALL + ) + if match: + new_number = match.group(1) + if new_number != number: + old_to_new[number] = new_number + elif long_desc == "REMOVED FROM DATABASE": + removed[number] = 1 + counters["removed"] += 1 + else: + raise IOError(f"Unsupported type of a '^' record: '{long_desc}'") + + # Generate synonyms from "MOVED TO" entries + for old, new in old_to_new.items(): + # Some entries in the MIM database have been moved multiple times + # Keep traversing the chain of renames until we have reached the end + while old_to_new.get(new): + new = old_to_new[new] + + # Check if the entry has been removed from the database + if not removed.get(new): + self.add_to_syn_for_mult_sources( + new, sources, old, species_id, xref_dbi + ) + counters["synonyms"] += 1 + + result_message = "%d genemap and %d phenotype MIM xrefs added\n" % ( + counters[gene_source_id], + counters[morbid_source_id], + ) + result_message += ( + "\t%d synonyms (defined by MOVED TO) added\n" % counters["synonyms"] + ) + result_message += "\t%d 
entries removed" % counters["removed"] + + return 0, result_message + + def extract_ti(self, input_record: str) -> str: + ti = None + + match = re.search( + r"[*]FIELD[*]\sTI\n(.+?)\n?(?:[*]FIELD[*]| [*]RECORD[*]| [*]THEEND[*])", + input_record, + flags=re.MULTILINE | re.DOTALL, + ) + if match: + ti = match.group(1) + + return ti + + def parse_ti(self, ti: str) -> Tuple[Optional[str], Optional[str], Optional[str]]: + ti = re.sub(r"(?:;;\n|\n;;)", ";;", ti, flags=re.MULTILINE | re.DOTALL) + ti = re.sub(r"\n", "", ti, flags=re.MULTILINE | re.DOTALL) + + match = re.search(r"\A([#%+*^]*)(\d+)\s+(.+)", ti) + if match: + return match.group(1), match.group(2), match.group(3) + + return None, None, None diff --git a/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py b/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py new file mode 100644 index 000000000..6c7688889 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py @@ -0,0 +1,170 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for MIM to Gene source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + +EXPECTED_NUMBER_OF_COLUMNS = 6 + + +class Mim2GeneParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + general_source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not general_source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Get needed source IDs + mim_gene_source_id = self.get_source_id_for_source_name("MIM_GENE", xref_dbi) + mim_morbid_source_id = self.get_source_id_for_source_name( + "MIM_MORBID", xref_dbi + ) + entrez_source_id = self.get_source_id_for_source_name("EntrezGene", xref_dbi) + + # This will be used to prevent insertion of duplicates + self.build_dependent_mappings(mim_gene_source_id, xref_dbi) + self.build_dependent_mappings(mim_morbid_source_id, xref_dbi) + + mim_gene = self.get_valid_codes("MIM_GENE", species_id, xref_dbi) + mim_morbid = self.get_valid_codes("MIM_MORBID", species_id, xref_dbi) + entrez = self.get_valid_codes("EntrezGene", species_id, xref_dbi) + + counters = { + "all_entries": 0, + "dependent_on_entrez": 0, + "missed_master": 0, + "missed_omim": 0, + } + + file_io = self.get_filehandle(file) + csv_reader = csv.reader(file_io, delimiter="\t") + + # Read lines + for line in csv_reader: + if not line: + continue + + # Extract the header from among the comments + match = re.search(r"\A([#])?", line[0]) + if match: + is_comment = match.group(1) + if is_comment: + patterns = [ + r"\A[#]?\s*MIM[ ]number", + "GeneID", + "type", + "Source", + "MedGenCUI", + "Comment", + ] + if len( + line + ) == EXPECTED_NUMBER_OF_COLUMNS and not self.is_file_header_valid( + EXPECTED_NUMBER_OF_COLUMNS, patterns, line, True + ): + raise 
IOError( + f"Malformed or unexpected header in Mim2Gene file {file}" + ) + continue + + if len(line) != EXPECTED_NUMBER_OF_COLUMNS: + raise IOError( + f"Line {csv_reader.line_num} of input file {file} has an incorrect number of columns" + ) + + fields = [re.sub(r"\s+\Z", "", x) for x in line] + omim_acc = fields[0] + entrez_id = fields[1] + type = fields[2] + source = fields[3] + medgen = fields[4] + comment = fields[5] + + counters["all_entries"] += 1 + + # No point in doing anything if we have no matching MIM xref ... + if omim_acc not in mim_gene and omim_acc not in mim_morbid: + counters["missed_omim"] += 1 + continue + + # ...or no EntrezGene xref to match it to + if not entrez_id or entrez_id not in entrez: + counters["missed_master"] += 1 + continue + + # Check if type is known + if verbose and type not in [ + "gene", + "gene/phenotype", + "predominantly phenotypes", + "phenotype", + ]: + logging.warn( + f"Unknown type {type} for MIM Number {omim_acc} ({file}:{csv_reader.line_num})" + ) + + # With all the checks taken care of, insert the mappings. We check + # both MIM_GENE and MIM_MORBID every time because some MIM entries + # can appear in both. + if omim_acc in mim_gene: + for mim_xref_id in mim_gene[omim_acc]: + counters["dependent_on_entrez"] += self.process_xref_entry( + { + "mim_xref_id": mim_xref_id, + "mim_source_id": mim_gene_source_id, + "entrez_xrefs": entrez[entrez_id], + "entrez_source_id": entrez_source_id, + }, + xref_dbi, + ) + if omim_acc in mim_morbid: + for mim_xref_id in mim_morbid[omim_acc]: + counters["dependent_on_entrez"] += self.process_xref_entry( + { + "mim_xref_id": mim_xref_id, + "mim_source_id": mim_morbid_source_id, + "entrez_xrefs": entrez[entrez_id], + "entrez_source_id": entrez_source_id, + }, + xref_dbi, + ) + + file_io.close() + + result_message = ( + "Processed %d entries. Out of those\n" % counters["all_entries"] + ) + result_message += "\t%d had missing OMIM entries,\n" % counters["missed_omim"] + result_message += ( + "\t%d were dependent EntrezGene xrefs,\n" % counters["dependent_on_entrez"] + ) + result_message += "\t%d had missing master entries." % counters["missed_master"] + + return 0, result_message + + def process_xref_entry(self, args: Dict[str, Any], dbi: Connection) -> int: + count = 0 + + for ent_id in args["entrez_xrefs"]: + self.add_dependent_xref_maponly( + args["mim_xref_id"], args["mim_source_id"], ent_id, None, dbi, True + ) + count += 1 + + return count diff --git a/src/python/ensembl/production/xrefs/parsers/RFAMParser.py b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py new file mode 100644 index 000000000..c7d4990eb --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py @@ -0,0 +1,193 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
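A note on the TI-field handling in MIMParser above: the record prefix decides whether an entry becomes a MIM_GENE xref ('*'), a MIM_MORBID xref ('#', '%', or no prefix), both ('+'), or a move/removal ('^'), and "MOVED TO" entries may chain through several renames. A minimal standalone sketch (sample TI text and MIM numbers are invented, not from the OMIM dump):

import re

def parse_ti(ti: str):
    # Same normalisation as MIMParser.parse_ti: collapse ';;' continuations and newlines
    ti = re.sub(r"(?:;;\n|\n;;)", ";;", ti)
    ti = re.sub(r"\n", "", ti)
    match = re.search(r"\A([#%+*^]*)(\d+)\s+(.+)", ti)
    return match.groups() if match else (None, None, None)

print(parse_ti("*601739 GENE XYZ; XYZ"))  # ('*', '601739', 'GENE XYZ; XYZ')

# '^' records that were "MOVED TO" another number can chain; the parser walks
# the chain to its end before adding the old number as a synonym there.
old_to_new = {"100100": "100200", "100200": "100300"}
new = old_to_new["100100"]
while new in old_to_new:
    new = old_to_new[new]
print(new)  # 100300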
+ +"""Parser module for RFAM source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class RFAMParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + species_name = args["species_name"] + file = args["file"] + dba = args["dba"] + ensembl_release = args["ensembl_release"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Extract db connection parameters from file + wget_url, db_user, db_host, db_port, db_name, db_pass = ( + self.extract_params_from_string( + file, ["wget", "user", "host", "port", "dbname", "pass"] + ) + ) + if not db_user: + db_user = "ensro" + if not db_port: + db_port = "3306" + + # Get the species name(s) + species_id_to_names = self.species_id_to_names(xref_dbi) + if species_name: + species_id_to_names.setdefault(species_id, []).append(species_name) + + if not species_id_to_names.get(species_id): + return 0, "Skipped. Could not find species ID to name mapping" + + species_name = species_id_to_names[species_id][0] + + # Connect to the appropriate rfam db + if db_host: + rfam_db_url = URL.create( + "mysql", db_user, db_pass, db_host, db_port, db_name + ) + elif dba: + rfam_db_url = dba + else: + if verbose: + logging.info("Looking for db in mysql-ens-sta-1") + registry = "ensro@mysql-ens-sta-1:4519" + rfam_db_url = self.get_db_from_registry( + species_name, "core", ensembl_release, registry + ) + + if not rfam_db_url: + raise IOError(f"Could not find RFAM DB.") + else: + if verbose: + logging.info(f"Found RFAM DB: {rfam_db_url}") + + # Get data from rfam db + db_engine = self.get_db_engine(rfam_db_url) + with db_engine.connect() as rfam_dbi: + query = ( + select( + TranscriptORM.stable_id.distinct(), + DnaAlignFeatureORM.hit_name, + AnalysisORM.analysis_id, + ) + .join( + TranscriptORM, + and_( + TranscriptORM.analysis_id == AnalysisORM.analysis_id, + AnalysisORM.logic_name.like("ncrna%"), + TranscriptORM.biotype != "miRNA", + ), + ) + .join( + ExonTranscriptORM, + ExonTranscriptORM.transcript_id == TranscriptORM.transcript_id, + ) + .join( + SupportingFeatureORM, + and_( + SupportingFeatureORM.exon_id == ExonTranscriptORM.exon_id, + SupportingFeatureORM.feature_type == "dna_align_feature", + ), + ) + .join( + DnaAlignFeatureORM, + DnaAlignFeatureORM.dna_align_feature_id + == SupportingFeatureORM.feature_id, + ) + .order_by(DnaAlignFeatureORM.hit_name) + ) + result = rfam_dbi.execute(query).mappings().all() + + # Create a dict with RFAM accessions as keys and value is an array of ensembl transcript stable_ids + rfam_transcript_stable_ids = {} + for row in result: + rfam_id = None + + match = re.search(r"^(RF\d+)", row.hit_name) + if match: + rfam_id = match.group(1) + + if rfam_id: + rfam_transcript_stable_ids.setdefault(rfam_id, []).append(row.stable_id) + + # Download file through wget if url present + if wget_url: + uri = urlparse(wget_url) + file = os.path.join(os.path.dirname(file), os.path.basename(uri.path)) + wget.download(wget_url, file) + + # Read data from file + lines = [] + entry = "" + + file_io = gzip.open(file, "r") + for line in file_io: + line = line.decode("latin-1") + if re.search(r"^//", line): + lines.append(entry) + entry = "" + elif ( + re.search(r"^#=GF\sAC", line) + or re.search(r"^#=GF\sID", line) + or re.search(r"^#=GF\sDE", line) + ): + entry += line + file_io.close() + + # Add 
xrefs + xref_count, direct_count = 0, 0 + + for entry in lines: + accession, label, description = None, None, None + + # Extract data from entry + match = re.search(r"^#=GF\sAC\s+(\w+)", entry, flags=re.MULTILINE) + if match: + accession = match.group(1) + match = re.search(r"^#=GF\sID\s+([^\n]+)", entry, flags=re.MULTILINE) + if match: + label = match.group(1) + match = re.search(r"^#=GF\sDE\s+([^\n]+)", entry, flags=re.MULTILINE) + if match: + description = match.group(1) + + if accession: + if rfam_transcript_stable_ids.get(accession): + xref_id = self.add_xref( + { + "accession": accession, + "version": 0, + "label": label or accession, + "description": description, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + xref_count += 1 + + transcript_stable_ids = rfam_transcript_stable_ids[accession] + for stable_id in transcript_stable_ids: + self.add_direct_xref( + xref_id, stable_id, "Transcript", "", xref_dbi + ) + direct_count += 1 + + result_message = ( + f"Added {xref_count} RFAM xrefs and {direct_count} direct xrefs" + ) + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/RGDParser.py b/src/python/ensembl/production/xrefs/parsers/RGDParser.py new file mode 100644 index 000000000..11ddd0e0e --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/RGDParser.py @@ -0,0 +1,154 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for RGD source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class RGDParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + direct_source_id = self.get_source_id_for_source_name( + "RGD", xref_dbi, "direct_xref" + ) + + # Used to assign dbIDs for when RGD Xrefs are dependent on RefSeq xrefs + preloaded_refseq = self.get_valid_codes("refseq", species_id, xref_dbi) + + rgd_io = self.get_filehandle(file) + csv_reader = csv.DictReader( + filter(lambda row: row[0] != "#", rgd_io), delimiter="\t" + ) + + header_found, count, ensembl_count, mismatch, syn_count = 0, 0, 0, 0, 0 + columns = {} + + # Read lines + for line in csv_reader: + # Don't bother doing anything if we don't have an RGD ID + if not line.get("GENE_RGD_ID") or not line["GENE_RGD_ID"]: + continue + + # Some RGD annotation is directly copied from Ensembl + if re.search("ENSRNO", line["SYMBOL"]): + continue + + genbank_nucleotides = [] + if line.get("GENBANK_NUCLEOTIDE"): + genbank_nucleotides = line["GENBANK_NUCLEOTIDE"].split(";") + + done = 0 + # The nucleotides are sorted in the file in alphabetical order. 
Filter them down + # to a higher quality subset, then add dependent Xrefs where possible + for nucleotide in self.sort_refseq_accessions(genbank_nucleotides): + if not done and preloaded_refseq.get(nucleotide): + for xref in preloaded_refseq[nucleotide]: + xref_id = self.add_dependent_xref( + { + "master_xref_id": xref, + "accession": line["GENE_RGD_ID"], + "label": line["SYMBOL"], + "description": line["NAME"], + "source_id": source_id, + "species_id": species_id, + }, + xref_dbi, + ) + + count += 1 + syn_count += self.process_synonyms( + xref_id, line["OLD_SYMBOL"], xref_dbi + ) + done = 1 + + # Add direct xrefs + if line.get("ENSEMBL_ID"): + ensembl_ids = line["ENSEMBL_ID"].split(";") + + for id in ensembl_ids: + self.add_to_direct_xrefs( + { + "stable_id": id, + "ensembl_type": "gene", + "accession": line["GENE_RGD_ID"], + "label": line["SYMBOL"], + "description": line["NAME"], + "source_id": direct_source_id, + "species_id": species_id, + }, + xref_dbi, + ) + xref_id = self.get_xref_id( + line["GENE_RGD_ID"], direct_source_id, species_id, xref_dbi + ) + + ensembl_count += 1 + syn_count += self.process_synonyms( + xref_id, line["OLD_SYMBOL"], xref_dbi + ) + done = 1 + + # If neither direct or dependent, add misc xref + if not done: + xref_id = self.add_xref( + { + "accession": line["GENE_RGD_ID"], + "label": line["SYMBOL"], + "description": line["NAME"], + "source_id": source_id, + "species_id": species_id, + "info_type": "MISC", + }, + xref_dbi, + ) + mismatch += 1 + + rgd_io.close() + + result_message = f"{count} xrefs succesfully loaded and dependent on refseq\n" + result_message += f"\t{mismatch} xrefs added but with NO dependencies\n" + result_message += f"\t{ensembl_count} direct xrefs successfully loaded\n" + result_message += f"\tAdded {syn_count} synonyms, including duplicates" + + return 0, result_message + + def sort_refseq_accessions(self, accessions: List[str]) -> List[str]: + refseq_priorities = {"NM": 1, "NP": 1, "NR": 1, "XM": 2, "XP": 2, "XR": 2} + + accessions = sorted( + [x for x in accessions if x[:2] in refseq_priorities], + key=lambda x: (refseq_priorities[x[:2]], x), + ) + return accessions + + def process_synonyms(self, xref_id: int, synonym_string: str, dbi: Connection) -> int: + syn_count = 0 + + if not synonym_string or not xref_id: + return syn_count + + synonyms = synonym_string.split(";") + for synonym in synonyms: + self.add_synonym(xref_id, synonym, dbi) + syn_count += 1 + + return syn_count diff --git a/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py b/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py new file mode 100644 index 000000000..4ae9b46d8 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py @@ -0,0 +1,189 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
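The RefSeq prioritisation used by RGDParser.sort_refseq_accessions above can be shown in isolation: curated NM_/NP_/NR_ accessions sort ahead of predicted XM_/XP_/XR_ ones, and anything outside those prefixes is dropped. A small sketch with invented accessions:

refseq_priorities = {"NM": 1, "NP": 1, "NR": 1, "XM": 2, "XP": 2, "XR": 2}

accessions = ["XM_039112", "NM_012345", "AC_000001", "NR_003287"]
kept = sorted(
    (a for a in accessions if a[:2] in refseq_priorities),  # drop non-RefSeq prefixes
    key=lambda a: (refseq_priorities[a[:2]], a),             # curated first, then alphabetical
)
print(kept)  # ['NM_012345', 'NR_003287', 'XM_039112']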
+ +"""Parser module for Reactome source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class ReactomeParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + species_name = args["species_name"] + file = args["file"] + release_file = args["rel_file"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Parse release file + if release_file: + release = None + + release_io = self.get_filehandle(release_file) + for line in release_io: + match = re.search(r"([0-9]*)", line) + if match: + release = match.group(1) + if verbose: + logging.info(f"Reactome release is '{release}'") + release_io.close() + + if not release: + raise IOError(f"Could not find release using {release_file}") + + self.set_release(source_id, release, xref_dbi) + + # Create a hash of all valid names for this species + species_to_alias = self.species_id_to_names(xref_dbi) + if species_name: + species_to_alias.setdefault(species_id, []).append(species_name) + + if not species_to_alias.get(species_id): + return 0, "Skipped. Could not find species ID to name mapping" + + aliases = species_to_alias[species_id] + alias_to_species_id = {alias: 1 for alias in aliases} + + # Get relevant source ids + reactome_source_id = self.get_source_id_for_source_name( + "reactome", xref_dbi, "direct" + ) + transcript_reactome_source_id = self.get_source_id_for_source_name( + "reactome_transcript", xref_dbi + ) + gene_reactome_source_id = self.get_source_id_for_source_name( + "reactome_gene", xref_dbi + ) + reactome_uniprot_source_id = self.get_source_id_for_source_name( + "reactome", xref_dbi, "uniprot" + ) + + # Cannot continue unless source ids are found + if ( + not reactome_source_id + or not transcript_reactome_source_id + or not gene_reactome_source_id + ): + raise KeyError("Could not find source id for reactome sources") + else: + if verbose: + logging.info(f"Source_id = {reactome_source_id}") + logging.info(f"Transcript_source_id = {transcript_reactome_source_id}") + logging.info(f"Gene_source_id = {gene_reactome_source_id}") + + if not reactome_uniprot_source_id: + raise KeyError("Could not find source id for reactome uniprot") + else: + if verbose: + logging.info(f"Uniprot_source_id = {reactome_uniprot_source_id}") + + # Get uniprot accessions + is_uniprot = 0 + uniprot_accessions = {} + if re.search("UniProt", file): + is_uniprot = 1 + uniprot_accessions = self.get_valid_codes("uniprot/", species_id, xref_dbi) + + parsed_count, err_count = 0, 0 + + # Read file + reactome_io = self.get_filehandle(file) + + for line in reactome_io: + line = line.strip() + + (ensembl_stable_id, reactome_id, url, description, evidence, species) = ( + re.split(r"\t+", line) + ) + + # Check description pattern + match = re.search( + r"^[A-Za-z0-9_,\(\)\/\-\.:\+'&;\"\/\?%>\s\[\]]+$", description + ) + if not match: + continue + + species = re.sub(r"\s", "_", species) + species = species.lower() + + current_source_id = reactome_source_id + + if alias_to_species_id.get(species): + parsed_count += 1 + + ensembl_type = None + info_type = "DIRECT" + + # Add uniprot dependent xrefs + if is_uniprot: + if uniprot_accessions.get(ensembl_stable_id): + for xref in uniprot_accessions[ensembl_stable_id]: + xref_id = self.add_dependent_xref( + { + "master_xref_id": xref, + "accession": reactome_id, + "label": 
reactome_id, + "description": description, + "source_id": reactome_uniprot_source_id, + "species_id": species_id, + }, + xref_dbi, + ) + info_type = "DEPENDENT" + + # Attempt to guess the object_type based on the stable id + elif re.search(r"G[0-9]*$", ensembl_stable_id): + ensembl_type = "gene" + current_source_id = gene_reactome_source_id + elif re.search(r"T[0-9]*$", ensembl_stable_id): + ensembl_type = "transcript" + current_source_id = transcript_reactome_source_id + elif re.search(r"P[0-9]*$", ensembl_stable_id): + ensembl_type = "translation" + + # Is not in Uniprot and does not match Ensembl stable id format + else: + if verbose: + logging.debug(f"Could not find type for {ensembl_stable_id}") + err_count += 1 + continue + + # Add new entry for reactome xref as well as direct xref to ensembl stable id + xref_id = self.add_xref( + { + "accession": reactome_id, + "label": reactome_id, + "description": description, + "source_id": current_source_id, + "species_id": species_id, + "info_type": info_type, + }, + xref_dbi, + ) + + if ensembl_type: + self.add_direct_xref( + xref_id, ensembl_stable_id, ensembl_type, "", xref_dbi + ) + + reactome_io.close() + + result_message = f"{parsed_count} entries processed\n" + result_message += f"{err_count} not found" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py new file mode 100644 index 000000000..14f6f76dd --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py @@ -0,0 +1,96 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
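ReactomeParser above guesses the Ensembl object type from the stable id suffix when the mapping file is not the UniProt one. A standalone sketch of that branch (the example stable ids are illustrative human ids, not taken from a Reactome file):

import re

def guess_ensembl_type(stable_id: str):
    # Mirrors the suffix checks in ReactomeParser.run: gene ids end in G<digits>,
    # transcripts in T<digits>, translations in P<digits>; anything else is skipped.
    if re.search(r"G[0-9]*$", stable_id):
        return "gene"
    if re.search(r"T[0-9]*$", stable_id):
        return "transcript"
    if re.search(r"P[0-9]*$", stable_id):
        return "translation"
    return None

print(guess_ensembl_type("ENSG00000139618"))  # gene
print(guess_ensembl_type("ENST00000380152"))  # transcript
print(guess_ensembl_type("ENSP00000369497"))  # translation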
+ +"""Parser module for RefSeq coordinate xrefs.""" + +from ensembl.production.xrefs.parsers.BaseParser import * +from ensembl.common.RangeRegistry import RangeRegistry + + +class RefSeqCoordinateParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + species_name = args["species_name"] + file = args["file"] + dba = args["dba"] + ensembl_release = args["ensembl_release"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + source_ids = { + "peptide": self.get_source_id_for_source_name( + "RefSeq_peptide", xref_dbi, "otherfeatures" + ), + "mrna": self.get_source_id_for_source_name( + "RefSeq_mRNA", xref_dbi, "otherfeatures" + ), + "ncrna": self.get_source_id_for_source_name( + "RefSeq_ncRNA", xref_dbi, "otherfeatures" + ), + "peptide_predicted": self.get_source_id_for_source_name( + "RefSeq_peptide_predicted", xref_dbi, "otherfeatures" + ), + "mrna_predicted": self.get_source_id_for_source_name( + "RefSeq_mRNA_predicted", xref_dbi, "otherfeatures" + ), + "ncrna_predicted": self.get_source_id_for_source_name( + "RefSeq_ncRNA_predicted", xref_dbi, "otherfeatures" + ), + "entrezgene": self.get_source_id_for_source_name("EntrezGene", xref_dbi), + "wikigene": self.get_source_id_for_source_name("WikiGene", xref_dbi), + } + + if verbose: + logging.info(f'RefSeq_peptide source ID = {source_ids["peptide"]}') + logging.info(f'RefSeq_mRNA source ID = {source_ids["mrna"]}') + logging.info(f'RefSeq_ncRNA source ID = {source_ids["ncrna"]}') + logging.info( + f'RefSeq_peptide_predicted source ID = {source_ids["peptide_predicted"]}' + ) + logging.info( + f'RefSeq_mRNA_predicted source ID = {source_ids["mrna_predicted"]}' + ) + logging.info( + f'RefSeq_ncRNA_predicted source ID = {source_ids["ncrna_predicted"]}' + ) + + # Get the species name(s) + species_id_to_names = self.species_id_to_names(xref_dbi) + if species_name: + species_id_to_names.setdefault(species_id, []).append(species_name) + + if not species_id_to_names.get(species_id): + return 0, "Skipped. Could not find species ID to name mapping." + species_name = species_id_to_names[species_id][0] + + # Connect to the appropriate dbs + if dba: + scripts_dir = args["perl_scripts_dir"] + xref_db_url = args["xref_db_url"] + source_ids_json = json.dumps(source_ids) + + logging.info( + f"Running perl script {scripts_dir}/refseq_coordinate_parser.pl" + ) + perl_cmd = f"perl {scripts_dir}/refseq_coordinate_parser.pl --xref_db_url '{xref_db_url}' --core_db_url '{args['core_db_url']}' --otherf_db_url '{dba}' --source_ids '{source_ids_json}' --species_id {species_id} --species_name {species_name} --release {ensembl_release}" + cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE) + + return 0, "Added refseq_import xrefs." + else: + # Not all species have an otherfeatures database, skip if not found + return 0, f"Skipped. No otherfeatures database for '{species_name}'." diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py b/src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py new file mode 100644 index 000000000..93d773270 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py @@ -0,0 +1,341 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for RefSeq sources (dna and peptide).""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class RefSeqGPFFParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + species_name = args["species_name"] + file = args["file"] + release_file = args["rel_file"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Get needed source ids + source_ids = { + "peptide_source_id": self.get_source_id_for_source_name( + "RefSeq_peptide", xref_dbi + ), + "mrna_source_id": self.get_source_id_for_source_name( + "RefSeq_mRNA", xref_dbi, "refseq" + ), + "ncrna_source_id": self.get_source_id_for_source_name( + "RefSeq_ncRNA", xref_dbi + ), + "pred_peptide_source_id": self.get_source_id_for_source_name( + "RefSeq_peptide_predicted", xref_dbi + ), + "pred_mrna_source_id": self.get_source_id_for_source_name( + "RefSeq_mRNA_predicted", xref_dbi, "refseq" + ), + "pred_ncrna_source_id": self.get_source_id_for_source_name( + "RefSeq_ncRNA_predicted", xref_dbi + ), + "entrez_source_id": self.get_source_id_for_source_name( + "EntrezGene", xref_dbi + ), + "wiki_source_id": self.get_source_id_for_source_name("WikiGene", xref_dbi), + } + + if verbose: + logging.info( + f'RefSeq_peptide source ID = {source_ids["peptide_source_id"]}' + ) + logging.info(f'RefSeq_mRNA source ID = {source_ids["mrna_source_id"]}') + logging.info(f'RefSeq_ncRNA source ID = {source_ids["ncrna_source_id"]}') + logging.info( + f'RefSeq_peptide_predicted source ID = {source_ids["pred_peptide_source_id"]}' + ) + logging.info( + f'RefSeq_mRNA_predicted source ID = {source_ids["pred_mrna_source_id"]}' + ) + logging.info( + f'RefSeq_ncRNA_predicted source ID = {source_ids["pred_ncrna_source_id"]}' + ) + logging.info(f'EntrezGene source ID = {source_ids["entrez_source_id"]}') + logging.info(f'WikiGene source ID = {source_ids["wiki_source_id"]}') + + # Extract version from release file + if release_file: + # Parse and set release info + index = 0 + for section in self.get_file_sections(release_file, "***"): + index += 1 + if index == 2: + release = "".join(section) + release = re.sub(r"\s{2,}", " ", release) + release = release.strip() + release = re.sub( + r".*(NCBI Reference Sequence.*) Distribution.*", r"\1", release + ) + release = re.sub(r"Release (\d+)", r"Release \1,", release) + break + + # Set releases + self.set_release(source_ids["peptide_source_id"], release, xref_dbi) + self.set_release(source_ids["mrna_source_id"], release, xref_dbi) + self.set_release(source_ids["ncrna_source_id"], release, xref_dbi) + self.set_release(source_ids["pred_mrna_source_id"], release, xref_dbi) + self.set_release(source_ids["pred_ncrna_source_id"], release, xref_dbi) + self.set_release(source_ids["pred_peptide_source_id"], release, xref_dbi) + + 
result_message = self.create_xrefs( + source_ids, species_id, species_name, file, xref_dbi + ) + + return 0, result_message + + def create_xrefs(self, source_ids: Dict[str, int], species_id: int, species_name: str, file: str, dbi: Connection) -> str: + counts = { + "num_mrna": 0, + "num_ncrna": 0, + "num_pred_mrna": 0, + "num_pred_ncrna": 0, + "num_peptide": 0, + "num_pred_peptide": 0, + "num_entrez": 0, + "num_wiki": 0, + } + + # Create a dict of all valid names for this species + species_id_to_names = self.species_id_to_names(dbi) + if species_name: + species_id_to_names.setdefault(species_id, []).append(species_name) + if not species_id_to_names.get(species_id): + return "Skipped. Could not find species ID to name mapping" + names = species_id_to_names[species_id] + name_to_species_id = {name: species_id for name in names} + + # Create a dict of all valid taxon_ids for this species + species_id_to_tax = self.species_id_to_taxonomy(dbi) + species_id_to_tax.setdefault(species_id, []).append(species_id) + tax_ids = species_id_to_tax[species_id] + tax_to_species_id = {tax_id: species_id for tax_id in tax_ids} + + # Retrieve existing RefSeq mRNA, EntrezGene, and WikiGene xrefs + entrez_acc_to_label = self.get_acc_to_label("EntrezGene", species_id, dbi) + refseq_ids = self.get_valid_codes("RefSeq_mRNA", species_id, dbi) + refseq_ids.update( + self.get_valid_codes("RefSeq_mRNA_predicted", species_id, dbi) + ) + entrez_ids = self.get_valid_codes("EntrezGene", species_id, dbi) + wiki_ids = self.get_valid_codes("WikiGene", species_id, dbi) + + # Get file type + file_type = self.type_from_file(os.path.basename(file)) + if not file_type: + return f"Could not work out sequence type for {file}" + + xrefs = [] + + # Read file + for section in self.get_file_sections(file, "//\n"): + if len(section) == 1: + continue + + entry = "".join(section) + xref = {} + + # Extract the species name + species_id_check = None + match = re.search(r"\s+ORGANISM\s+(.*)\n", entry) + if match: + species = match.group(1).lower() + species = re.sub(r"^\s*", "", species) + species = re.sub(r"\s*\(.+\)", "", species) + species = re.sub(r"\s+", "_", species) + species = re.sub(r"\n", "", species) + + species_id_check = name_to_species_id[species] + + # Try going through the taxon ID if species check didn't work + if not species_id_check: + match = re.search(r"db_xref=\"taxon:(\d+)\"", entry) + if match: + taxon_id = match.group(1) + species_id_check = tax_to_species_id[taxon_id] + + # Skip xrefs for species that aren't in the species table + if not species_id_check or species_id != species_id_check: + continue + + # Extract accession and version + accession = re.search( + r"^ACCESSION\s+(\S+)", entry, flags=re.MULTILINE + ).group(1) + version = re.search(r"^VERSION\s+(\S+)", entry, flags=re.MULTILINE).group(1) + + # Get the right source ID based on file type and whether this is predicted (X*) or not + source_id = 0 + if file_type == "dna": + if re.search(r"^XM_", accession): + source_id = source_ids["pred_mrna_source_id"] + counts["num_pred_mrna"] += 1 + elif re.search(r"^XR", accession): + source_id = source_ids["pred_ncrna_source_id"] + counts["num_pred_ncrna"] += 1 + elif re.search(r"^NM", accession): + source_id = source_ids["mrna_source_id"] + counts["num_mrna"] += 1 + elif re.search(r"^NR", accession): + source_id = source_ids["ncrna_source_id"] + counts["num_ncrna"] += 1 + elif file_type == "peptide": + if re.search(r"^XP_", accession): + source_id = source_ids["pred_peptide_source_id"] + counts["num_pred_peptide"] += 1 
+ else: + source_id = source_ids["peptide_source_id"] + counts["num_peptide"] += 1 + + if not source_id: + logging.warning( + f"Could not get source ID for file type {file_type} for accession {accession}" + ) + + (acc_no_version, version) = version.split(".") + xref["ACCESSION"] = accession + if accession == acc_no_version: + xref["VERSION"] = version + + # Extract description (may be multi-line) + description = re.search( + r"^DEFINITION\s+([^[]+)", entry, flags=re.MULTILINE + ).group(1) + description = re.sub(r"\nACCESSION.*", "", description, flags=re.DOTALL) + description = re.sub(r"\n", "", description) + description = re.sub(r"{.*}-like", "", description) + description = re.sub(r"{.*}", "", description) + description = re.sub(r"\s+", " ", description) + if len(description) > 255: + description = description[0:255] + + # Extract sequence + sequence = re.search( + r"^\s*ORIGIN\s+(.+)", entry, flags=re.DOTALL | re.MULTILINE + ).group(1) + sequence_lines = sequence.split("\n") + parsed_sequence = "" + for seq_line in sequence_lines: + if seq_line: + sequence_only = re.search(r"^\s*\d+\s+(.*)$", seq_line).group(1) + if not sequence_only: + continue + parsed_sequence += sequence_only + parsed_sequence = re.sub(r"\s", "", parsed_sequence) + + # Extract related pair to current RefSeq accession + # For rna file, the pair is the protein_id + # For peptide file, the pair is in DBSOURCE REFSEQ accession + refseq_pair = None + match = re.search(r"DBSOURCE\s+REFSEQ: accession (\S+)", entry) + if match: + refseq_pair = match.group(1) + protein_id = re.findall(r"\/protein_id=.(\S+_\d+)", entry) + coded_by = re.findall(r"\/coded_by=.(\w+_\d+)", entry) + + for cb in coded_by: + xref["PAIR"] = cb + + if not xref.get("PAIR"): + xref["PAIR"] = refseq_pair + + if not xref.get("PAIR"): + for pi in protein_id: + xref["PAIR"] = pi + + xref["LABEL"] = f"{accession}.{version}" + xref["DESCRIPTION"] = description + xref["SOURCE_ID"] = source_id + xref["SEQUENCE"] = parsed_sequence + xref["SEQUENCE_TYPE"] = file_type + xref["SPECIES_ID"] = species_id + xref["INFO_TYPE"] = "SEQUENCE_MATCH" + xref["DEPENDENT_XREFS"] = [] + + # Extrat NCBIGene ids + seen_in_record = {} + ncbi_gene_ids = re.findall(r"db_xref=.GeneID:(\d+)", entry) + for gene_id in ncbi_gene_ids: + if not seen_in_record.get(gene_id) and entrez_acc_to_label.get(gene_id): + seen_in_record[gene_id] = 1 + + dependent = {} + dependent["SOURCE_ID"] = source_ids["entrez_source_id"] + dependent["LINKAGE_SOURCE_ID"] = source_id + dependent["ACCESSION"] = gene_id + dependent["LABEL"] = entrez_acc_to_label[gene_id] + xref["DEPENDENT_XREFS"].append(dependent) + counts["num_entrez"] += 1 + + dependent = {} + dependent["SOURCE_ID"] = source_ids["wiki_source_id"] + dependent["LINKAGE_SOURCE_ID"] = source_id + dependent["ACCESSION"] = gene_id + dependent["LABEL"] = entrez_acc_to_label[gene_id] + xref["DEPENDENT_XREFS"].append(dependent) + counts["num_wiki"] += 1 + + # Add xrefs for RefSeq mRNA as well where available + if refseq_pair: + refseq_pair = re.sub(r"\.[0-9]*", "", refseq_pair) + if refseq_pair: + if refseq_ids.get(refseq_pair): + for refseq_id in refseq_ids[refseq_pair]: + for entrez_id in entrez_ids.get(gene_id): + self.add_dependent_xref_maponly( + entrez_id, + source_ids["entrez_source_id"], + refseq_id, + None, + dbi, + ) + for wiki_id in wiki_ids.get(gene_id): + self.add_dependent_xref_maponly( + wiki_id, + source_ids["entrez_source_id"], + refseq_id, + None, + dbi, + ) + + xrefs.append(xref) + + if len(xrefs) > 0: + 
self.upload_xref_object_graphs(xrefs, dbi) + + result_message = f'Added {counts["num_mrna"]} mRNA xrefs, {counts["num_pred_mrna"]} predicted mRNA xrefs, {counts["num_ncrna"]} ncRNA xrefs, {counts["num_pred_ncrna"]} predicted ncRNA xrefs, {counts["num_peptide"]} peptide xrefs, and {counts["num_pred_peptide"]} predicted peptide xrefs\n' + result_message += f"Added the following dependent xrefs:\n" + result_message += f'\tEntrezGene\t{counts["num_entrez"]}\n' + result_message += f'\tWikiGene\t{counts["num_wiki"]}\n' + + return result_message + + def type_from_file(self, file_name: str) -> Optional[str]: + if re.search("RefSeq_protein", file_name): + return "peptide" + if re.search("rna", file_name): + return "dna" + if re.search("protein", file_name): + return "peptide" + + return None diff --git a/src/python/ensembl/production/xrefs/parsers/UCSCParser.py b/src/python/ensembl/production/xrefs/parsers/UCSCParser.py new file mode 100644 index 000000000..5de152912 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/UCSCParser.py @@ -0,0 +1,136 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for UCSC source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class UCSCParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + count = 0 + + file_io = self.get_filehandle(file) + csv_reader = csv.reader(file_io, delimiter="\t", strict=True) + + # Read lines + for line in csv_reader: + chromosome = line[1] + strand = line[2] + tx_start = int(line[3]) + tx_end = int(line[4]) + cds_start = int(line[5]) + cds_end = int(line[6]) + exon_starts = line[8] + exon_ends = line[9] + accession = line[11] + + # UCSC uses slightly different chromosome names, at least for + # human and mouse, so chop off the 'chr' in the beginning. We do + # not yet translate the names of the special chromosomes, e.g. + # "chr6_cox_hap1" (UCSC) into "c6_COX" (Ensembl) + chromosome = re.sub(r"\Achr", "", chromosome) + + # They also use '+' and '-' for the strand, instead of -1, 0, or 1 + if strand == "+": + strand = 1 + elif strand == "-": + strand = -1 + else: + strand = 0 + + # ... and non-coding transcripts have cds_start == cds_end. + # We would like these to be stored as NULLs + if cds_start == cds_end: + cds_start = None + cds_end = None + + # exon_starts and exon_ends usually have trailing commas, remove them + exon_starts = re.sub(r",\Z", "", exon_starts) + exon_ends = re.sub(r",\Z", "", exon_ends) + + # ... and they use the same kind of "inbetween" coordinates as e.g. 
+ # exonerate, so increment all start coordinates by one + tx_start += 1 + if cds_start: + cds_start += 1 + + # The string exon_starts is a comma-separated list of start coordinates + # for subsequent exons and we must increment each one. Split the string + # on commas, use map() to apply the "+1" transformation to every + # element of the resulting array, then join the result into a new + # comma-separated list + exon_starts = ",".join( + str(int(x) + 1) for x in re.split(r"\s*,\s*", exon_starts) + ) + + self.add_xref( + source_id, + species_id, + { + "accession": accession, + "chromosome": chromosome, + "strand": strand, + "txStart": tx_start, + "txEnd": tx_end, + "cdsStart": cds_start, + "cdsEnd": cds_end, + "exonStarts": exon_starts, + "exonEnds": exon_ends, + }, + xref_dbi, + ) + count += 1 + + file_io.close() + + result_message = f"Loaded a total of {count} UCSC xrefs" + + return 0, result_message + + def add_xref(self, source_id: int, species_id: int, xref: Dict[str, Any], dbi: Connection) -> None: + for required_key in [ + "accession", + "chromosome", + "strand", + "txStart", + "txEnd", + "exonStarts", + "exonEnds", + ]: + if not xref.get(required_key): + raise KeyError(f"Missing required key {required_key} for Xref") + + query = insert(CoordinateXrefORM).values( + source_id=source_id, + species_id=species_id, + accession=xref["accession"], + chromosome=xref["chromosome"], + strand=xref["strand"], + txStart=xref["txStart"], + txEnd=xref["txEnd"], + cdsStart=xref["cdsStart"], + cdsEnd=xref["cdsEnd"], + exonStarts=xref["exonStarts"], + exonEnds=xref["exonEnds"], + ) + dbi.execute(query) diff --git a/src/python/ensembl/production/xrefs/parsers/UniProtParser.py b/src/python/ensembl/production/xrefs/parsers/UniProtParser.py new file mode 100644 index 000000000..e99b33cdc --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/UniProtParser.py @@ -0,0 +1,452 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
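The coordinate handling in UCSCParser above converts UCSC's zero-based, half-open, '+'/'-' conventions to Ensembl's one-based, strand 1/-1 form; only start coordinates are incremented because a half-open end already equals a one-based inclusive end. A compact sketch on a made-up row:

import re

tx_start, cds_start, cds_end, strand = 1000, 1200, 1200, "+"
exon_starts = "1000,2000,3000,"  # trailing comma, as in the UCSC dump

strand = 1 if strand == "+" else -1 if strand == "-" else 0
if cds_start == cds_end:          # non-coding transcript: store NULLs
    cds_start = cds_end = None
tx_start += 1
if cds_start:
    cds_start += 1
exon_starts = ",".join(
    str(int(x) + 1)
    for x in re.split(r"\s*,\s*", re.sub(r",\Z", "", exon_starts))
)
print(tx_start, strand, cds_start, exon_starts)  # 1001 1 None 1001,2001,3001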
+ +"""Parser module for Uniprot sources.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + +import codecs + + +class UniProtParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + release_file = args["rel_file"] + verbose = args.get("verbose", False) + hgnc_file = args.get("hgnc_file") + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Get needed source ids + source_ids = { + "sp_source_id": self.get_source_id_for_source_name( + "Uniprot/SWISSPROT", xref_dbi, "sequence_mapped" + ), + "sptr_source_id": self.get_source_id_for_source_name( + "Uniprot/SPTREMBL", xref_dbi, "sequence_mapped" + ), + "sptr_non_display_source_id": self.get_source_id_for_source_name( + "Uniprot/SPTREMBL", xref_dbi, "protein_evidence_gt_2" + ), + "sp_direct_source_id": self.get_source_id_for_source_name( + "Uniprot/SWISSPROT", xref_dbi, "direct" + ), + "sptr_direct_source_id": self.get_source_id_for_source_name( + "Uniprot/SPTREMBL", xref_dbi, "direct" + ), + "isoform_source_id": self.get_source_id_for_source_name( + "Uniprot_isoform", xref_dbi + ), + } + + if verbose: + logging.info(f'SwissProt source ID = {source_ids["sp_source_id"]}') + logging.info(f'SpTREMBL source ID = {source_ids["sptr_source_id"]}') + logging.info( + f'SpTREMBL protein_evidence > 2 source ID = {source_ids["sptr_non_display_source_id"]}' + ) + logging.info( + f'SwissProt direct source ID = {source_ids["sp_direct_source_id"]}' + ) + logging.info( + f'SpTREMBL direct source ID = {source_ids["sptr_direct_source_id"]}' + ) + + # Parse and set release info + if release_file: + sp_release = None + sptr_release = None + + release_io = self.get_filehandle(release_file) + for line in release_io: + line = line.strip() + if not line: + continue + + match = re.search(r"(UniProtKB/Swiss-Prot Release .*)", line) + if match: + sp_release = match.group(1) + if verbose: + logging.info(f"Swiss-Prot release is {sp_release}") + else: + match = re.search(r"(UniProtKB/TrEMBL Release .*)", line) + if match: + sptr_release = match.group(1) + if verbose: + logging.info(f"SpTrEMBL release is {sptr_release}") + + release_io.close() + + # Set releases + self.set_release(source_ids["sp_source_id"], sp_release, xref_dbi) + self.set_release(source_ids["sptr_source_id"], sptr_release, xref_dbi) + self.set_release( + source_ids["sptr_non_display_source_id"], sptr_release, xref_dbi + ) + self.set_release(source_ids["sp_direct_source_id"], sp_release, xref_dbi) + self.set_release( + source_ids["sptr_direct_source_id"], sptr_release, xref_dbi + ) + + result_message = self.create_xrefs(source_ids, species_id, file, xref_dbi, hgnc_file) + + return 0, result_message + + def create_xrefs(self, source_ids: Dict[str, int], species_id: int, file: str, dbi: Connection, hgnc_file: str = None) -> str: + counts = { + "num_sp": 0, + "num_sptr": 0, + "num_sptr_non_display": 0, + "num_direct_sp": 0, + "num_direct_sptr": 0, + "num_isoform": 0, + } + dependent_xrefs_counts = {} + ensembl_derived_protein_count = 0 + count = 0 + + # Get sources ids of dependent sources + dependent_sources = self.get_xref_sources(dbi) + + # Extract descriptions from hgnc + hgnc_descriptions = {} + if hgnc_file: + hgnc_descriptions = self.get_hgnc_descriptions(hgnc_file) + + # Create a hash of all valid taxon_ids for this species + species_id_to_tax = 
self.species_id_to_taxonomy(dbi) + species_id_to_tax.setdefault(species_id, []).append(species_id) + tax_ids = species_id_to_tax[species_id] + tax_to_species_id = {tax_id: species_id for tax_id in tax_ids} + + xrefs = [] + + # Read file + for section in self.get_file_sections(file, "//\n"): + if len(section) == 1: + continue + + entry = "".join(section) + xref = {} + + # Extract the species taxon id + found = 0 + match = re.search(r"OX\s+[a-zA-Z_]+=([0-9 ,]+).*;", entry) + if match: + ox = match.group(1) + for taxon_id_from_file in ox.split(", "): + taxon_id_from_file = re.sub(r"\s", "", taxon_id_from_file) + if tax_to_species_id.get(taxon_id_from_file): + found = 1 + count += 1 + + # If no taxon_id's match, skip to next record + if not found: + continue + + # Check for CC (caution) lines containing certain text + # If sequence is from Ensembl, do not use + ensembl_derived_protein = 0 + if re.search( + r"CAUTION: The sequence shown here is derived from an Ensembl", entry + ): + ensembl_derived_protein = 1 + ensembl_derived_protein_count += 1 + + # Extract ^AC lines and build list of accessions + accessions = [] + accessions_only = re.findall(r"\nAC\s+(.+)", entry) + for accessions_line in accessions_only: + for acc in accessions_line.split(";"): + acc = acc.strip() + if acc: + accessions.append(acc) + accession = accessions[0] + + if accession.lower() == "unreviewed": + logging.warn( + f"WARNING: entries with accession of {accession} not allowed, will be skipped" + ) + continue + + xref["ACCESSION"] = accession + xref["INFO_TYPE"] = "SEQUENCE_MATCH" + xref["SYNONYMS"] = [] + for i in range(1, len(accessions)): + xref["SYNONYMS"].append(accessions[i]) + + sp_type = re.search(r"ID\s+(\w+)\s+(\w+)", entry).group(2) + protein_evidence_code = re.search(r"PE\s+(\d+)", entry).group(1) + version = re.search(r"DT\s+\d+-\w+-\d+, entry version (\d+)", entry).group( + 1 + ) + + # SwissProt/SPTrEMBL are differentiated by having STANDARD/PRELIMINARY here + if re.search(r"^Reviewed", sp_type, re.IGNORECASE): + xref["SOURCE_ID"] = source_ids["sp_source_id"] + counts["num_sp"] += 1 + elif re.search(r"Unreviewed", sp_type, re.IGNORECASE): + # Use normal source only if it is PE levels 1 & 2 + if protein_evidence_code and int(protein_evidence_code) < 3: + xref["SOURCE_ID"] = source_ids["sptr_source_id"] + counts["num_sptr"] += 1 + else: + xref["SOURCE_ID"] = source_ids["sptr_non_display_source_id"] + counts["num_sptr_non_display"] += 1 + else: + continue + + # Some straightforward fields + xref["LABEL"] = f"{accession}.{version}" + xref["VERSION"] = version + xref["SPECIES_ID"] = species_id + xref["SEQUENCE_TYPE"] = "peptide" + xref["STATUS"] = "experimental" + xref["DEPENDENT_XREFS"] = [] + xref["DIRECT_XREFS"] = [] + + # Extract ^DE lines only and build cumulative description string + description = "" + description_lines = re.findall(r"\nDE\s+(.+)", entry) + for line in description_lines: + match = re.search(r"RecName: Full=(.*);", line) + if match: + if description: + description += "; " + description += match.group(1) + else: + match = re.search(r"SubName: Full=(.*);", line) + if match: + if description: + description += "; " + description += match.group(1) + + description = re.sub(r"^\s*", "", description) + description = re.sub(r"\s*$", "", description) + description = re.sub(r"\s*\{ECO:.*?\}", "", description) + + # Parse the EC_NUMBER line, only for S.cerevisiae for now + if re.search(r"EC=", line) and species_id == "4932": + # Get the EC Number and make it an xref for S.cer if any + EC = 
re.search(r"\s*EC=([^;]+);", line).group(1) + + dependent = {} + dependent["LABEL"] = EC + dependent["ACCESSION"] = EC + dependent["SOURCE_NAME"] = "EC_NUMBER" + dependent["SOURCE_ID"] = dependent_sources["EC_NUMBER"] + dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] + xref["DEPENDENT_XREFS"].append(dependent) + dependent_xrefs_counts["EC_NUMBER"] = ( + dependent_xrefs_counts.get("EC_NUMBER", 0) + 1 + ) + + xref["DESCRIPTION"] = description + + # Extract sequence + sequence = re.search(r"SQ\s+(.+)", entry, flags=re.DOTALL).group(1) + sequence = re.sub(r"\n", "", sequence) + sequence = re.sub(r"\/\/", "", sequence) + sequence = re.sub(r"\s", "", sequence) + sequence = re.sub(r"^.*;", "", sequence) + xref["SEQUENCE"] = sequence + + # Extract gene names + gene_names = re.findall(r"\nGN\s+(.+)", entry) + gene_names = " ".join(gene_names).split(";") + + # Do not allow the addition of UniProt Gene Name dependent Xrefs + # if the protein was imported from Ensembl. Otherwise we will + # re-import previously set symbols + if not ensembl_derived_protein: + dependent = {} + name_found = 0 + gene_name = None + dep_synonyms = [] + for line in gene_names: + line = line.strip() + + if not re.search(r"Name=", line) and not re.search( + r"Synonyms=", line + ): + continue + + match = re.search(r"Name=([A-Za-z0-9_\-\.\s]+)", line) + if match and not name_found: + gene_name = match.group(1).rstrip() + gene_name = re.sub(r"\nGN", "", gene_name) + name_found = 1 + + match = re.search(r"Synonyms=(.*)", line) + if match: + synonym = match.group(1) + synonym = re.sub(r"\{.*?\}", "", synonym) + synonym = re.sub(r"\s+$", "", synonym) + synonym = re.sub(r"\s*,\s*", ",", synonym) + synonyms = synonym.split(",") + for synonym in synonyms: + if synonym not in dep_synonyms: + dep_synonyms.append(synonym) + + if gene_name: + dependent["LABEL"] = gene_name + dependent["ACCESSION"] = xref["ACCESSION"] + dependent["SOURCE_NAME"] = "Uniprot_gn" + dependent["SOURCE_ID"] = dependent_sources["Uniprot_gn"] + dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] + dependent["SYNONYMS"] = dep_synonyms + if hgnc_file and hgnc_descriptions.get(gene_name) is not None: + dependent["DESCRIPTION"] = hgnc_descriptions[gene_name] + xref["DEPENDENT_XREFS"].append(dependent) + dependent_xrefs_counts["Uniprot_gn"] = ( + dependent_xrefs_counts.get("Uniprot_gn", 0) + 1 + ) + + # Dependent xrefs - only store those that are from sources listed in the source table + deps = re.findall(r"\n(DR\s+.+)", entry) + + seen = {} + for dep in deps: + match = re.search(r"^DR\s+(.+)", dep) + if match: + vals = re.split(r";\s*", match.group(1)) + source = vals[0] + acc = vals[1] + extra = [] + if len(vals) > 2: + extra = vals[2 : len(vals)] + + # Skip external sources obtained through other files + if re.search( + r"^(GO|UniGene|RGD|CCDS|IPI|UCSC|SGD|HGNC|MGI|VGNC|Orphanet|ArrayExpress|GenomeRNAi|EPD|Xenbase|Reactome|MIM|GeneCards)", + source, + ): + continue + + # If mapped to Ensembl, add as direct xref + if source == "Ensembl": + direct = {} + isoform = {} + + stable_id = extra[0] + stable_id = re.sub(r"\.[0-9]+", "", stable_id) + direct["STABLE_ID"] = stable_id + direct["ENSEMBL_TYPE"] = "Translation" + direct["LINKAGE_TYPE"] = "DIRECT" + if xref["SOURCE_ID"] == source_ids["sp_source_id"]: + direct["SOURCE_ID"] = source_ids["sp_direct_source_id"] + counts["num_direct_sp"] += 1 + else: + direct["SOURCE_ID"] = source_ids["sptr_direct_source_id"] + counts["num_direct_sptr"] += 1 + xref["DIRECT_XREFS"].append(direct) + + match = re.search(r"(%s-[0-9]+)" % 
accession, extra[1]) + if match: + isoform = match.group(1) + self.add_to_direct_xrefs( + { + "stable_id": stable_id, + "ensembl_type": "translation", + "accession": isoform, + "label": isoform, + "source_id": source_ids["isoform_source_id"], + "linkage": "DIRECT", + "species_id": species_id, + }, + dbi, + ) + counts["num_isoform"] += 1 + + # Create dependent xref structure & store it + if dependent_sources.get(source): + dependent = {} + + dependent["SOURCE_NAME"] = source + dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] + dependent["SOURCE_ID"] = dependent_sources[source] + dependent["ACCESSION"] = acc + + if not seen.get(f"{source}:{acc}"): + xref["DEPENDENT_XREFS"].append(dependent) + dependent_xrefs_counts[source] = ( + dependent_xrefs_counts.get(source, 0) + 1 + ) + seen[f"{source}:{acc}"] = 1 + + if re.search(r"EMBL", dep) and not re.search(r"ChEMBL", dep): + protein_id = extra[0] + if protein_id != "-" and not seen.get( + f"{source}:{protein_id}" + ): + dependent = {} + + dependent["SOURCE_NAME"] = source + dependent["SOURCE_ID"] = dependent_sources["protein_id"] + dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] + dependent["LABEL"] = protein_id + dependent["ACCESSION"] = re.search( + r"([^.]+)\.([^.]+)", protein_id + ).group(1) + xref["DEPENDENT_XREFS"].append(dependent) + dependent_xrefs_counts[source] = ( + dependent_xrefs_counts.get(source, 0) + 1 + ) + seen[f"{source}:{protein_id}"] = 1 + + xrefs.append(xref) + + if count > 1000: + self.upload_xref_object_graphs(xrefs, dbi) + count = 0 + xrefs.clear() + + if len(xrefs) > 0: + self.upload_xref_object_graphs(xrefs, dbi) + + result_message = f'Read {counts["num_sp"]} SwissProt xrefs, {counts["num_sptr"]} SPTrEMBL xrefs with protein evidence codes 1-2, and {counts["num_sptr_non_display"]} SPTrEMBL xrefs with protein evidence codes > 2 from {file}\n' + result_message += f'Added {counts["num_direct_sp"]} direct SwissProt xrefs and {counts["num_direct_sptr"]} direct SPTrEMBL xrefs\n' + result_message += f'Added {counts["num_isoform"]} direct isoform xrefs\n' + result_message += f"Skipped {ensembl_derived_protein_count} ensembl annotations as Gene names\n" + + result_message += f"Added the following dependent xrefs:\n" + for xref_source, xref_count in dependent_xrefs_counts.items(): + result_message += f"\t{xref_source}\t{xref_count}\n" + + return result_message + + def get_hgnc_descriptions(self, hgnc_file: str) -> Dict[str, str]: + descriptions = {} + + # Make sure the file is utf8 + hgnc_file = codecs.encode(hgnc_file, "utf-8").decode("utf-8") + hgnc_file = re.sub(r'"', '', hgnc_file) + + hgnc_io = self.get_filehandle(hgnc_file) + csv_reader = csv.DictReader(hgnc_io, delimiter="\t") + + # Read lines + for line in csv_reader: + gene_name = line["Approved symbol"] + description = line["Approved name"] + + descriptions[gene_name] = description + + hgnc_io.close() + + return descriptions \ No newline at end of file diff --git a/src/python/ensembl/production/xrefs/parsers/VGNCParser.py b/src/python/ensembl/production/xrefs/parsers/VGNCParser.py new file mode 100644 index 000000000..21cb13d58 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/VGNCParser.py @@ -0,0 +1,93 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for VGNC source (uses HGNC Parser as parent).""" + +from ensembl.production.xrefs.parsers.HGNCParser import * + + +class VGNCParser(HGNCParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Create a hash of all valid taxon_ids for this species + species_id_to_tax = self.species_id_to_taxonomy(xref_dbi) + species_id_to_tax.setdefault(species_id, []).append(species_id) + + tax_ids = species_id_to_tax[species_id] + tax_to_species_id = {tax_id: species_id for tax_id in tax_ids} + + # Open the vgnc file + file_io = self.get_filehandle(file) + csv_reader = csv.DictReader(file_io, delimiter="\t") + + # Check if header has required columns + required_columns = [ + "taxon_id", + "ensembl_gene_id", + "vgnc_id", + "symbol", + "name", + "alias_symbol", + "prev_symbol", + ] + if not set(required_columns).issubset(set(csv_reader.fieldnames)): + raise IOError(f"Can't find required columns in VGNC file '{file}'") + + # Read lines + count = 0 + for line in csv_reader: + # Skip data for other species + if not tax_to_species_id.get(line["taxon_id"]): + continue + + # Add ensembl direct xref + if line["ensembl_gene_id"]: + self.add_to_direct_xrefs( + { + "stable_id": line["ensembl_gene_id"], + "ensembl_type": "gene", + "accession": line["vgnc_id"], + "label": line["symbol"], + "description": line["name"], + "source_id": source_id, + "species_id": species_id, + }, + xref_dbi, + ) + + self.add_synonyms_for_hgnc( + { + "source_id": source_id, + "name": line["vgnc_id"], + "species_id": species_id, + "dead": line["alias_symbol"], + "alias": line["prev_symbol"], + }, + xref_dbi, + ) + + count += 1 + + file_io.close() + + result_message = f"Loaded a total of {count} VGNC xrefs" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py b/src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py new file mode 100644 index 000000000..38c8ccbda --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py @@ -0,0 +1,76 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
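The GN-line handling in UniProtParser above takes the first Name= value as the gene symbol and collects Synonyms= values, stripping ECO evidence tags. A minimal sketch, assuming GN content in the UniProtKB flat-file style (the gene and synonym values here are illustrative only):

import re

gn_block = "Name=BRCA2 {ECO:0000303|PubMed:123}; Synonyms=FACD, FANCD1;"

gene_name, synonyms = None, []
for part in gn_block.split(";"):
    part = part.strip()
    m = re.search(r"Name=([A-Za-z0-9_\-\.\s]+)", part)
    if m and gene_name is None:
        gene_name = m.group(1).rstrip()       # first Name= wins
    m = re.search(r"Synonyms=(.*)", part)
    if m:
        syn = re.sub(r"\{.*?\}", "", m.group(1))   # drop {ECO:...} tags
        syn = re.sub(r"\s*,\s*", ",", syn.strip())
        synonyms.extend(s for s in syn.split(",") if s and s not in synonyms)

print(gene_name, synonyms)  # BRCA2 ['FACD', 'FANCD1']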
+ +"""Parser module for Xenbase source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class XenopusJamboreeParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + count = 0 + + file_io = self.get_filehandle(file) + csv_reader = csv.reader(file_io, delimiter="\t") + + # Read lines + for line in csv_reader: + accession = line[0] + label = line[1] + desc = line[2] + stable_id = line[3] + + # If there is a description, trim it a bit + if desc: + desc = self.parse_description(desc) + + if label == "unnamed": + label = accession + + self.add_to_direct_xrefs( + { + "stable_id": stable_id, + "ensembl_type": "gene", + "accession": accession, + "label": label, + "description": desc, + "source_id": source_id, + "species_id": species_id, + }, + xref_dbi, + ) + count += 1 + + file_io.close() + + result_message = f"{count} XenopusJamboreeParser xrefs succesfully parsed" + + return 0, result_message + + def parse_description(self, description: str) -> str: + # Remove some provenance information encoded in the description + description = re.sub(r"\s*\[.*\]", "", description) + + # Remove labels of type 5 of 14 from the description + description = re.sub(r",\s+\d+\s+of\s+\d+", "", description) + + return description diff --git a/src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py b/src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py new file mode 100644 index 000000000..4e703788a --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py @@ -0,0 +1,62 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Parser module for ZFIN Descriptions.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class ZFINDescParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + count = 0 + withdrawn = 0 + + file_io = self.get_filehandle(file) + csv_reader = csv.DictReader(file_io, delimiter="\t") + csv_reader.fieldnames = ["zfin", "desc", "label", "extra1", "extra2"] + + # Read lines + for line in csv_reader: + # Skip if WITHDRAWN: this precedes both desc and label + if re.search(r"\A WITHDRAWN:", line["label"]): + withdrawn += 1 + else: + xref_id = self.add_xref( + { + "accession": line["zfin"], + "label": line["label"], + "description": line["desc"], + "source_id": source_id, + "species_id": species_id, + "info_type": "MISC", + }, + xref_dbi, + ) + count += 1 + + file_io.close() + + result_message = ( + f"{count} ZFINDesc xrefs added, {withdrawn} withdrawn entries ignored" + ) + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/ZFINParser.py b/src/python/ensembl/production/xrefs/parsers/ZFINParser.py new file mode 100644 index 000000000..8734d62ca --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/ZFINParser.py @@ -0,0 +1,169 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Parser module for ZFIN source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class ZFINParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Get the ZFIN source ids + direct_src_id = self.get_source_id_for_source_name( + "ZFIN_ID", xref_dbi, "direct" + ) + dependent_src_id = self.get_source_id_for_source_name( + "ZFIN_ID", xref_dbi, "uniprot/refseq" + ) + description_src_id = self.get_source_id_for_source_name( + "ZFIN_ID", xref_dbi, "description_only" + ) + + # Get the ZFIN descriptions + description = {} + query = select(XrefUORM.accession, XrefUORM.description).where( + XrefUORM.source_id == description_src_id + ) + for row in xref_dbi.execute(query).mappings().all(): + if row.description: + description[row.accession] = row.description + + # Get the Uniprot and RefSeq accessions + swiss = self.get_valid_codes("uniprot/swissprot", species_id, xref_dbi) + refseq = self.get_valid_codes("refseq", species_id, xref_dbi) + + file_dir = os.path.dirname(file) + counts = {"direct": 0, "uniprot": 0, "refseq": 0, "synonyms": 0, "mismatch": 0} + + # Process ZFIN to ensEMBL mappings + zfin = {} + zfin_io = self.get_filehandle(os.path.join(file_dir, "ensembl_1_to_1.txt")) + zfin_csv_reader = csv.DictReader(zfin_io, delimiter="\t", strict=True) + zfin_csv_reader.fieldnames = ["zfin", "so", "label", "ensembl_id"] + for line in zfin_csv_reader: + self.add_to_direct_xrefs( + { + "stable_id": line["ensembl_id"], + "ensembl_type": "gene", + "accession": line["zfin"], + "label": line["label"], + "description": description.get(line["zfin"]), + "source_id": direct_src_id, + "species_id": species_id, + }, + xref_dbi, + ) + + zfin[line["zfin"]] = 1 + counts["direct"] += 1 + + zfin_io.close() + + # Process ZFIN to Uniprot mappings + swissprot_io = self.get_filehandle(os.path.join(file_dir, "uniprot.txt")) + swissprot_csv_reader = csv.DictReader(swissprot_io, delimiter="\t", strict=True) + swissprot_csv_reader.fieldnames = ["zfin", "so", "label", "acc"] + for line in swissprot_csv_reader: + if swiss.get(line["acc"]) and not zfin.get(line["zfin"]): + for xref_id in swiss[line["acc"]]: + self.add_dependent_xref( + { + "master_xref_id": xref_id, + "accession": line["zfin"], + "label": line["label"], + "description": description.get(line["zfin"]), + "source_id": dependent_src_id, + "species_id": species_id, + }, + xref_dbi, + ) + counts["uniprot"] += 1 + else: + counts["mismatch"] += 1 + + swissprot_io.close() + + # Process ZFIN to RefSeq mappings + refseq_io = self.get_filehandle(os.path.join(file_dir, "refseq.txt")) + refseq_csv_reader = csv.DictReader(refseq_io, delimiter="\t", strict=True) + refseq_csv_reader.fieldnames = ["zfin", "so", "label", "acc"] + for line in refseq_csv_reader: + # Ignore mappings to predicted RefSeq + if ( + re.search(r"^XP_", line["acc"]) + or re.search(r"^XM_", line["acc"]) + or re.search(r"^XR_", line["acc"]) + ): + continue + + if refseq.get(line["acc"]) and not zfin.get(line["zfin"]): + for xref_id in refseq[line["acc"]]: + self.add_dependent_xref( + { + "master_xref_id": xref_id, + "accession": line["zfin"], + "label": line["label"], + "description": description.get(line["zfin"]), + "source_id": source_id, + "species_id": species_id, + }, + xref_dbi, + ) + counts["refseq"] += 1 + 
else: + counts["mismatch"] += 1 + + refseq_io.close() + + # Get the ZFIN xrefs that have just been added + zfin = self.get_valid_codes("zfin", species_id, xref_dbi) + + sources = [] + query = select(SourceUORM.source_id).where(SourceUORM.name.like("ZFIN_ID")) + for row in xref_dbi.execute(query).fetchall(): + sources.append(row[0]) + + # Process the synonyms + aliases_io = self.get_filehandle(os.path.join(file_dir, "aliases.txt")) + aliases_csv_reader = csv.DictReader(aliases_io, delimiter="\t", strict=True) + aliases_csv_reader.fieldnames = ["acc", "cur_name", "cur_symbol", "syn", "so"] + for line in aliases_csv_reader: + if zfin.get(line["acc"]): + synonym = ( + unicodedata.normalize("NFKD", line["syn"]) + .encode("ascii", "namereplace") + .decode("ascii") + ) + self.add_to_syn_for_mult_sources( + line["acc"], sources, synonym, species_id, xref_dbi + ) + counts["synonyms"] += 1 + + aliases_io.close() + + result_message = f"{counts['direct']} direct ZFIN xrefs added and\n" + result_message += f"\t{counts['uniprot']} dependent xrefs from UniProt added\n" + result_message += f"\t{counts['refseq']} dependent xrefs from RefSeq added\n" + result_message += f"\t{counts['mismatch']} dependents ignored\n" + result_message += f"\t{counts['synonyms']} synonyms loaded" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/__init__.py b/src/python/ensembl/production/xrefs/parsers/__init__.py new file mode 100644 index 000000000..e58354b3f --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/__init__.py @@ -0,0 +1,15 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Xref parsers modules.""" diff --git a/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py b/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py new file mode 100644 index 000000000..dcba51ccb --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py @@ -0,0 +1,113 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +"""Parser module for miRBase source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class miRBaseParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + species_name = args["species_name"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Get the species name(s) + species_to_names = self.species_id_to_names(xref_dbi) + if species_name: + species_to_names.setdefault(species_id, []).append(species_name) + if not species_to_names.get(species_id): + return 0, "Skipped. Could not find species ID to name mapping" + + names = species_to_names[species_id] + name_to_species_id = {name: species_id for name in names} + + xrefs = self.create_xrefs(source_id, file, species_id, name_to_species_id) + if not xrefs: + return 0, "No xrefs added" + + self.upload_xref_object_graphs(xrefs, xref_dbi) + + result_message = "Read %d xrefs from %s" % (len(xrefs), file) + + return 0, result_message + + def create_xrefs(self, source_id: int, file: str, species_id: int, name_to_species_id: Dict[str, int]) -> List[Dict[str, Any]]: + xrefs = [] + + # Read mirbase file + for section in self.get_file_sections(file, "//\n"): + if len(section) == 1: + continue + + entry = "".join(section) + if not entry: + continue + + xref = {} + + (header, sequence) = re.split(r"\nSQ", entry, 2) + species = None + + # Extract sequence + if sequence: + seq_lines = sequence.split("\n") + seq_lines.pop(0) + + sequence = "".join(seq_lines) + sequence = sequence.upper() + sequence = re.sub("U", "T", sequence) + sequence = re.sub(r"[\d+,\s+]", "", sequence) + + # Extract name, accession, and description + name = re.search(r"^ID\s+(\S+)\s+", header, flags=re.MULTILINE).group(1) + accession = re.search(r"^AC\s+(\S+);\s+", header, flags=re.MULTILINE).group( + 1 + ) + description = re.search( + r"^DE\s+(.+)\s+stem(-|\s)loop", header, flags=re.MULTILINE + ).group(1) + + # Format description and extract species name + if description: + description_parts = re.split(r"\s+", description) + description_parts.pop() + species = " ".join(description_parts) + species = species.lower() + species = re.sub(" ", "_", species) + + # If no species match, skip to next record + species_id_check = name_to_species_id.get(species) + if not species_id_check: + continue + + if species_id and species_id == species_id_check: + xref = { + "SEQUENCE_TYPE": "dna", + "STATUS": "experimental", + "SOURCE_ID": source_id, + "ACCESSION": accession, + "LABEL": name, + "DESCRIPTION": name, + "SEQUENCE": sequence, + "SPECIES_ID": species_id, + } + xrefs.append(xref) + + return xrefs From 9047a9bcb4cdfdf09ab5b2a63df2dba0a64a6119 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 28 Oct 2024 14:49:18 +0000 Subject: [PATCH 02/12] Parsers: optimizations + unit tests --- scripts/xrefs/refseq_coordinate_parser.pl | 2 +- .../xrefs/parsers/ArrayExpressParser.py | 158 +++-- .../production/xrefs/parsers/BaseParser.py | 51 +- .../production/xrefs/parsers/CCDSParser.py | 107 ++-- .../production/xrefs/parsers/DBASSParser.py | 139 ++-- .../xrefs/parsers/EntrezGeneParser.py | 126 ++-- .../production/xrefs/parsers/HGNCParser.py | 484 ++++++-------- .../production/xrefs/parsers/HPAParser.py | 60 +- .../xrefs/parsers/JGI_ProteinParser.py | 59 +- .../production/xrefs/parsers/MGIDescParser.py | 107 ++++ .../production/xrefs/parsers/MGIParser.py 
| 53 +- .../xrefs/parsers/MGI_CCDS_Parser.py | 107 ---- .../xrefs/parsers/MGI_Desc_Parser.py | 101 --- .../production/xrefs/parsers/MIMParser.py | 138 ++-- .../xrefs/parsers/Mim2GeneParser.py | 125 ++-- .../production/xrefs/parsers/RFAMParser.py | 211 ++++--- .../production/xrefs/parsers/RGDParser.py | 130 ++-- .../xrefs/parsers/ReactomeParser.py | 275 ++++---- .../xrefs/parsers/RefSeqCoordinateParser.py | 125 ++-- .../xrefs/parsers/RefSeqGPFFParser.py | 341 ---------- .../production/xrefs/parsers/RefSeqParser.py | 316 ++++++++++ .../production/xrefs/parsers/UCSCParser.py | 145 ++--- .../production/xrefs/parsers/UniProtParser.py | 575 ++++++++--------- .../production/xrefs/parsers/VGNCParser.py | 89 +-- .../xrefs/parsers/XenopusJamboreeParser.py | 92 +-- .../xrefs/parsers/ZFINDescParser.py | 74 +-- .../production/xrefs/parsers/ZFINParser.py | 249 ++++---- .../production/xrefs/parsers/miRBaseParser.py | 79 ++- src/python/test/xrefs/__init__.py | 15 + src/python/test/xrefs/conftest.py | 135 ++++ .../test/xrefs/parsers/flatfiles/dbass3.txt | 9 + .../test/xrefs/parsers/flatfiles/dbass5.txt | 8 + .../xrefs/parsers/flatfiles/entrezgene.txt | 13 + .../test/xrefs/parsers/flatfiles/hgnc.txt | 21 + .../test/xrefs/parsers/flatfiles/hpa.txt | 11 + .../xrefs/parsers/flatfiles/jgi_protein.fasta | 108 ++++ .../test/xrefs/parsers/flatfiles/mgi.txt | 10 + .../test/xrefs/parsers/flatfiles/mgi_desc.txt | 11 + .../test/xrefs/parsers/flatfiles/mim.txt | 122 ++++ .../test/xrefs/parsers/flatfiles/mim2gene.txt | 10 + .../test/xrefs/parsers/flatfiles/mirbase.txt | 506 +++++++++++++++ .../parsers/flatfiles/reactome_UniProt.txt | 8 + .../parsers/flatfiles/reactome_ensembl.txt | 14 + .../parsers/flatfiles/reactome_release.txt | 1 + .../parsers/flatfiles/refseq_protein.txt | 291 +++++++++ .../parsers/flatfiles/refseq_release.txt | 94 +++ .../xrefs/parsers/flatfiles/refseq_rna.txt | 508 +++++++++++++++ .../test/xrefs/parsers/flatfiles/rfam.txt | 381 +++++++++++ .../test/xrefs/parsers/flatfiles/rgd.txt | 98 +++ .../test/xrefs/parsers/flatfiles/ucsc.txt | 10 + .../parsers/flatfiles/uniprot_release.txt | 3 + .../parsers/flatfiles/uniprot_swissprot.txt | 591 ++++++++++++++++++ .../parsers/flatfiles/uniprot_trembl.txt | 570 +++++++++++++++++ .../test/xrefs/parsers/flatfiles/vgnc.txt | 11 + .../parsers/flatfiles/xenopus_jamboree.txt | 12 + .../xrefs/parsers/flatfiles/zfin/aliases.txt | 10 + .../parsers/flatfiles/zfin/ensembl_1_to_1.txt | 10 + .../xrefs/parsers/flatfiles/zfin/refseq.txt | 10 + .../xrefs/parsers/flatfiles/zfin/uniprot.txt | 10 + .../xrefs/parsers/flatfiles/zfin_desc.txt | 9 + .../xrefs/parsers/test_arrayexpress_parser.py | 110 ++++ .../test/xrefs/parsers/test_ccds_parser.py | 91 +++ .../test/xrefs/parsers/test_dbass_parser.py | 147 +++++ .../xrefs/parsers/test_entrezgene_parser.py | 157 +++++ .../test/xrefs/parsers/test_hgnc_parser.py | 182 ++++++ .../test/xrefs/parsers/test_hpa_parser.py | 132 ++++ .../xrefs/parsers/test_jgi_protein_parser.py | 61 ++ .../xrefs/parsers/test_mgi_desc_parser.py | 148 +++++ .../test/xrefs/parsers/test_mgi_parser.py | 84 +++ .../xrefs/parsers/test_mim2gene_parser.py | 250 ++++++++ .../test/xrefs/parsers/test_mim_parser.py | 126 ++++ .../test/xrefs/parsers/test_mirbase_parser.py | 111 ++++ .../xrefs/parsers/test_reactome_parser.py | 166 +++++ .../test/xrefs/parsers/test_refseq_parser.py | 243 +++++++ .../test/xrefs/parsers/test_rfam_parser.py | 130 ++++ .../test/xrefs/parsers/test_rgd_parser.py | 126 ++++ .../test/xrefs/parsers/test_ucsc_parser.py | 89 +++ 
.../test/xrefs/parsers/test_uniprot_parser.py | 181 ++++++ .../test/xrefs/parsers/test_vgnc_parser.py | 96 +++ .../parsers/test_xenopus_jamboree_parser.py | 78 +++ .../xrefs/parsers/test_zfin_desc_parser.py | 63 ++ .../test/xrefs/parsers/test_zfin_parser.py | 165 +++++ src/python/test/xrefs/pytest.ini | 2 + src/python/test/xrefs/test_helpers.py | 80 +++ 84 files changed, 8813 insertions(+), 2343 deletions(-) create mode 100644 src/python/ensembl/production/xrefs/parsers/MGIDescParser.py delete mode 100644 src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py delete mode 100644 src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py delete mode 100644 src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/RefSeqParser.py create mode 100644 src/python/test/xrefs/__init__.py create mode 100644 src/python/test/xrefs/conftest.py create mode 100644 src/python/test/xrefs/parsers/flatfiles/dbass3.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/dbass5.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/entrezgene.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/hgnc.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/hpa.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/jgi_protein.fasta create mode 100644 src/python/test/xrefs/parsers/flatfiles/mgi.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/mgi_desc.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/mim.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/mim2gene.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/mirbase.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/reactome_UniProt.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/reactome_ensembl.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/reactome_release.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/refseq_protein.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/refseq_release.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/refseq_rna.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/rfam.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/rgd.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/ucsc.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/uniprot_release.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/uniprot_swissprot.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/uniprot_trembl.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/vgnc.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/xenopus_jamboree.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/zfin/aliases.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/zfin/ensembl_1_to_1.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/zfin/refseq.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/zfin/uniprot.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/zfin_desc.txt create mode 100644 src/python/test/xrefs/parsers/test_arrayexpress_parser.py create mode 100644 src/python/test/xrefs/parsers/test_ccds_parser.py create mode 100644 src/python/test/xrefs/parsers/test_dbass_parser.py create mode 100644 src/python/test/xrefs/parsers/test_entrezgene_parser.py create mode 100644 src/python/test/xrefs/parsers/test_hgnc_parser.py create mode 100644 
src/python/test/xrefs/parsers/test_hpa_parser.py create mode 100644 src/python/test/xrefs/parsers/test_jgi_protein_parser.py create mode 100644 src/python/test/xrefs/parsers/test_mgi_desc_parser.py create mode 100644 src/python/test/xrefs/parsers/test_mgi_parser.py create mode 100644 src/python/test/xrefs/parsers/test_mim2gene_parser.py create mode 100644 src/python/test/xrefs/parsers/test_mim_parser.py create mode 100644 src/python/test/xrefs/parsers/test_mirbase_parser.py create mode 100644 src/python/test/xrefs/parsers/test_reactome_parser.py create mode 100644 src/python/test/xrefs/parsers/test_refseq_parser.py create mode 100644 src/python/test/xrefs/parsers/test_rfam_parser.py create mode 100644 src/python/test/xrefs/parsers/test_rgd_parser.py create mode 100644 src/python/test/xrefs/parsers/test_ucsc_parser.py create mode 100644 src/python/test/xrefs/parsers/test_uniprot_parser.py create mode 100644 src/python/test/xrefs/parsers/test_vgnc_parser.py create mode 100644 src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py create mode 100644 src/python/test/xrefs/parsers/test_zfin_desc_parser.py create mode 100644 src/python/test/xrefs/parsers/test_zfin_parser.py create mode 100644 src/python/test/xrefs/pytest.ini create mode 100644 src/python/test/xrefs/test_helpers.py diff --git a/scripts/xrefs/refseq_coordinate_parser.pl b/scripts/xrefs/refseq_coordinate_parser.pl index 808284ee4..dae228391 100644 --- a/scripts/xrefs/refseq_coordinate_parser.pl +++ b/scripts/xrefs/refseq_coordinate_parser.pl @@ -99,7 +99,7 @@ # Not all species have refseq_import data, skip if not found if (!defined $logic_name) { print STDERR "No data found for RefSeq_import, skipping import\n";; - return; + exit 1; } # Get otherfeatures chromosomes diff --git a/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py b/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py index 988b92ffa..53e78e887 100644 --- a/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py +++ b/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py @@ -14,33 +14,36 @@ """Parser module for ArrayExpress source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import logging +from typing import Dict, Any, Tuple, List, Optional +from sqlalchemy import select +from sqlalchemy.engine import URL +from ftplib import FTP +from ensembl.core.models import Gene as GeneORM + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class ArrayExpressParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - species_name = args["species_name"] - file = args["file"] - dba = args["dba"] - ensembl_release = args["ensembl_release"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) - - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + source_id = args.get("source_id") + species_id = args.get("species_id") + species_name = args.get("species_name") + xref_file = args.get("file", "") + dba = args.get("dba") + ensembl_release = args.get("ensembl_release") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) + + if not source_id or not species_id: + raise AttributeError("Missing required arguments: source_id and species_id") # Extract db connection parameters from file name - project, db_user, db_host, db_port, db_name, db_pass = ( - self.extract_params_from_string( - file, ["project", 
"user", "host", "port", "dbname", "pass"] - ) + project, db_user, db_host, db_port, db_name, db_pass = self.extract_params_from_string( + xref_file, ["project", "user", "host", "port", "dbname", "pass"] ) - if not db_user: - db_user = "ensro" - if not db_port: - db_port = "3306" + db_user = db_user or "ensro" + db_port = db_port or "3306" # Get the species name(s) species_id_to_names = self.species_id_to_names(xref_dbi) @@ -49,67 +52,31 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: if not species_id_to_names.get(species_id): return 0, "Skipped. Could not find species ID to name mapping" - names = species_id_to_names[species_id] # Look up the species in ftp server and check if active - species_lookup = self._get_species() - active = self._is_active(species_lookup, names, verbose) - if not active: + species_lookup = self.get_species_from_ftp() + if not self.is_arryaexpress_active(species_lookup, species_id_to_names[species_id], verbose): return 0, "Skipped. ArrayExpress source not active for species" species_name = species_id_to_names[species_id][0] # Connect to the appropriate arrayexpress db - if db_host: - arrayexpress_db_url = URL.create( - "mysql", db_user, db_pass, db_host, db_port, db_name - ) - elif project and project == "ensembl": - if verbose: - logging.info("Looking for db in mysql-ens-sta-1") - registry = "ensro@mysql-ens-sta-1:4519" - arrayexpress_db_url = self.get_db_from_registry( - species_name, "core", ensembl_release, registry - ) - elif project and project == "ensemblgenomes": - if verbose: - logging.info( - "Looking for db in mysql-eg-staging-1 and mysql-eg-staging-2" - ) - registry = "ensro@mysql-eg-staging-1.ebi.ac.uk:4160" - arrayexpress_db_url = self.get_db_from_registry( - species_name, "core", ensembl_release, registry - ) - - if not arrayexpress_db_url: - registry = "ensro@mysql-eg-staging-2.ebi.ac.uk:4275" - arrayexpress_db_url = self.get_db_from_registry( - species_name, "core", ensembl_release, registry - ) - elif dba: - arrayexpress_db_url = dba - else: - arrayexpress_db_url = None + arrayexpress_db_url = self.get_arrayexpress_db_url( + project, db_user, db_pass, db_host, db_port, db_name, species_name, ensembl_release, dba, verbose + ) if not arrayexpress_db_url: - raise IOError( - f"Could not find ArrayExpress DB. Missing or unsupported project value. Supported values: ensembl, ensemblgenomes." + raise AttributeError( + "Could not find ArrayExpress DB. Missing or unsupported project value. Supported values: ensembl, ensemblgenomes." 
) - else: - if verbose: - logging.info(f"Found ArrayExpress DB: {arrayexpress_db_url}") xref_count = 0 - db_engine = self.get_db_engine(arrayexpress_db_url) - with db_engine.connect() as arrayexpress_dbi: - query = select(GeneORM.stable_id).where( - GeneORM.biotype != "LRG_gene", GeneORM.is_current == 1 - ) - result = arrayexpress_dbi.execute(query).mappings().all() + # Get data from arrayexpress db + arrayexpress_data = self.get_arrayexpress_data(arrayexpress_db_url) # Add direct xref for every current gene found - for row in result: + for row in arrayexpress_data: xref_id = self.add_xref( { "accession": row.stable_id, @@ -121,41 +88,64 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: xref_dbi, ) self.add_direct_xref(xref_id, row.stable_id, "gene", "", xref_dbi) - xref_count += 1 result_message = f"Added {xref_count} DIRECT xrefs" - return 0, result_message - def _get_species(self) -> Dict[str, int]: + def get_species_from_ftp(self) -> Dict[str, bool]: ftp_server = "ftp.ebi.ac.uk" ftp_dir = "pub/databases/microarray/data/atlas/bioentity_properties/ensembl" species_lookup = {} - ftp = FTP(ftp_server) - ftp.login("anonymous", "-anonymous@") - ftp.cwd(ftp_dir) - remote_files = ftp.nlst() - ftp.close() + with FTP(ftp_server) as ftp: + ftp.login("anonymous", "-anonymous@") + ftp.cwd(ftp_dir) + remote_files = ftp.nlst() for file in remote_files: species = file.split(".")[0] - species_lookup[species] = 1 + species_lookup[species] = True return species_lookup - def _is_active(self, species_lookup: Dict[str, int], names: List[str], verbose: bool) -> bool: - # Loop through the names and aliases first. If we get a hit then great - active = False + def is_arryaexpress_active(self, species_lookup: Dict[str, bool], names: List[str], verbose: bool) -> bool: for name in names: if species_lookup.get(name): if verbose: - logging.info( - f"Found ArrayExpress has declared the name {name}. This was an alias" - ) - active = True - break + logging.info(f"Found ArrayExpress has declared the name {name}. 
This was an alias") + return True + return False + + def get_arrayexpress_db_url(self, project: str, db_user: str, db_pass: str, db_host: str, db_port: str, db_name: str, species_name: str, ensembl_release: str, dba: str, verbose: bool) -> Optional[URL]: + if db_host: + return URL.create("mysql", db_user, db_pass, db_host, db_port, db_name) + elif project == "ensembl": + if verbose: + logging.info("Looking for db in mysql-ens-sta-1") + registry = "ensro@mysql-ens-sta-1:4519" + return self.get_db_from_registry(species_name, "core", ensembl_release, registry) + elif project == "ensemblgenomes": + if verbose: + logging.info("Looking for db in mysql-eg-staging-1 and mysql-eg-staging-2") + registry = "ensro@mysql-eg-staging-1.ebi.ac.uk:4160" + db_url = self.get_db_from_registry(species_name, "core", ensembl_release, registry) + if not db_url: + registry = "ensro@mysql-eg-staging-2.ebi.ac.uk:4275" + return self.get_db_from_registry(species_name, "core", ensembl_release, registry) + return db_url + elif dba: + return dba - return active + return None + + def get_arrayexpress_data(self, arrayexpress_db_url: URL) -> List[Dict[str, Any]]: + db_engine = self.get_db_engine(arrayexpress_db_url) + with db_engine.connect() as arrayexpress_dbi: + query = select(GeneORM.stable_id).where( + GeneORM.biotype != "LRG_gene", GeneORM.is_current == 1 + ) + result = arrayexpress_dbi.execute(query).mappings().all() + + return result diff --git a/src/python/ensembl/production/xrefs/parsers/BaseParser.py b/src/python/ensembl/production/xrefs/parsers/BaseParser.py index 3ae7c2e2c..ad6440e37 100644 --- a/src/python/ensembl/production/xrefs/parsers/BaseParser.py +++ b/src/python/ensembl/production/xrefs/parsers/BaseParser.py @@ -14,8 +14,27 @@ """Base xref parser module to include all common functions used by xref parsers.""" -from ensembl.production.xrefs.Base import * - +import re +from sqlalchemy import select, update, func +from sqlalchemy.dialects.mysql import insert +from sqlalchemy.engine import Connection +from sqlalchemy.orm import aliased +from typing import List, Dict, Any, Optional + +from ensembl.xrefs.xref_update_db_model import ( + Source as SourceUORM, + Xref as XrefUORM, + PrimaryXref as PrimaryXrefORM, + DependentXref as DependentXrefUORM, + GeneDirectXref as GeneDirectXrefORM, + TranscriptDirectXref as TranscriptDirectXrefORM, + TranslationDirectXref as TranslationDirectXrefORM, + Synonym as SynonymORM, + Pairs as PairsORM, + Species as SpeciesORM, +) + +from ensembl.production.xrefs.Base import Base class BaseParser(Base): """Class to represent the base of xref parser modules. Inherits the xref Base class.""" @@ -67,9 +86,8 @@ def get_source_id_for_source_name(self, source_name: str, dbi: Connection, prior ) result = dbi.execute(query) - if result: - source_id = result.scalar() - else: + source_id = result.scalar() + if source_id is None: raise KeyError(f"No source_id for source_name={source_name}") return source_id @@ -117,11 +135,12 @@ def set_release(self, source_id: int, s_release: str, dbi: Connection) -> None: dbi: sqlalchemy.engine.Connection The database connection to update in """ - dbi.execute( - update(SourceUORM) - .where(SourceUORM.source_id == source_id) - .values(source_release=s_release) - ) + if s_release: + dbi.execute( + update(SourceUORM) + .where(SourceUORM.source_id == source_id) + .values(source_release=s_release) + ) def upload_xref_object_graphs(self, xrefs: List[Dict[str, Any]], dbi: Connection) -> None: """Adds xref data into a database. 
@@ -558,7 +577,9 @@ def add_dependent_xref_maponly(self, dependent_id: int, dependent_source_id: int linkage_annotation=master_source_id, linkage_source_id=dependent_source_id, ) - .prefix_with("IGNORE") + .on_duplicate_key_update( + linkage_source_id=dependent_source_id + ) ) self._xref_dependent_mapped[index] = master_source_id @@ -620,7 +641,9 @@ def add_synonym(self, xref_id: int, synonym: str, dbi: Connection) -> None: dbi.execute( insert(SynonymORM) .values(xref_id=xref_id, synonym=synonym) - .prefix_with("IGNORE") + .on_duplicate_key_update( + synonym=synonym + ) ) def get_ext_synonyms(self, source_name: str, dbi: Connection) -> Dict[str, List[str]]: @@ -658,7 +681,7 @@ def get_ext_synonyms(self, source_name: str, dbi: Connection) -> Dict[str, List[ ext_syns.setdefault(row.label, []).append(row.synonym) count += 1 - seen[acc_syn] = 1 + seen[acc_syn] = True return ext_syns @@ -684,7 +707,7 @@ def build_dependent_mappings(self, source_id: int, dbi: Connection) -> None: for row in dbi.execute(query).mappings().all(): self._xref_dependent_mapped[ - row.master_xref_id + "|" + row.dependent_xref_id + f"{row.master_xref_id}|{row.dependent_xref_id}" ] = row.linkage_annotation def get_valid_codes(self, source_name: str, species_id: int, dbi: Connection) -> Dict[str, List[int]]: diff --git a/src/python/ensembl/production/xrefs/parsers/CCDSParser.py b/src/python/ensembl/production/xrefs/parsers/CCDSParser.py index f2e258716..24d1e088c 100644 --- a/src/python/ensembl/production/xrefs/parsers/CCDSParser.py +++ b/src/python/ensembl/production/xrefs/parsers/CCDSParser.py @@ -14,30 +14,41 @@ """Parser module for CCDS source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import logging +from sqlalchemy import select +from sqlalchemy.engine import URL +from typing import Dict, Any, Tuple, List +from ensembl.core.models import ( + Transcript as TranscriptORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + ObjectXref as ObjectXrefCORM, +) + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class CCDSParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - dba = args["dba"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file", "") + dba = args.get("dba") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id: + raise AttributeError("Missing required arguments: source_id and species_id") # Extract db connection parameters from file db_user = "ensro" db_host, db_port, db_name, db_pass = self.extract_params_from_string( - file, ["host", "port", "dbname", "pass"] + xref_file, ["host", "port", "dbname", "pass"] ) - if not db_port: - db_port = "3306" + db_port = db_port or "3306" # Connect to the appropriate db + ccds_db_url = None if db_host: ccds_db_url = URL.create( "mysql", db_user, db_pass, db_host, db_port, db_name @@ -47,47 +58,36 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: if not ccds_db_url: return 1, "Could not find CCDS DB." 
- else: - if verbose: - logging.info(f"Found CCDS DB: {ccds_db_url}") + if verbose: + logging.info(f"Found CCDS DB: {ccds_db_url}") # Get data from ccds db - db_engine = self.get_db_engine(ccds_db_url) - with db_engine.connect() as ccds_dbi: - query = ( - select(TranscriptORM.stable_id, XrefCORM.dbprimary_acc) - .where( - XrefCORM.xref_id == ObjectXrefCORM.xref_id, - ObjectXrefCORM.ensembl_object_type == "Transcript", - ObjectXrefCORM.ensembl_id == TranscriptORM.transcript_id, - ExternalDbORM.external_db_id == XrefCORM.external_db_id, - ) - .filter(ExternalDbORM.db_name.like("Ens_%_transcript")) - ) - result = ccds_dbi.execute(query).mappings().all() + ccds_data = self.get_ccds_data(ccds_db_url) xref_count, direct_count = 0, 0 seen = {} - for row in result: + for row in ccds_data: stable_id = row.stable_id display_label = row.dbprimary_acc - (acc, version) = display_label.split(".") - - if not seen.get(display_label): - xref_id = self.add_xref( - { - "accession": acc, - "version": version, - "label": display_label, - "source_id": source_id, - "species_id": species_id, - "info_type": "DIRECT", - }, - xref_dbi, - ) - + if "." in display_label: + acc, version = display_label.split(".") + else: + acc, version = display_label, None + + if display_label not in seen: + xref_args = { + "accession": acc, + "label": display_label, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + } + if version is not None: + args["version"] = version + + xref_id = self.add_xref(xref_args, xref_dbi) xref_count += 1 seen[display_label] = xref_id else: @@ -96,6 +96,23 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: self.add_direct_xref(xref_id, stable_id, "Transcript", "", xref_dbi) direct_count += 1 - result_message = f"Parsed CCDS identifiers from {file}, added {xref_count} xrefs and {direct_count} direct_xrefs" + result_message = f"Parsed CCDS identifiers, added {xref_count} xrefs and {direct_count} direct_xrefs" return 0, result_message + + def get_ccds_data(self, ccds_db_url: str) -> List[Dict[str, Any]]: + db_engine = self.get_db_engine(ccds_db_url) + with db_engine.connect() as ccds_dbi: + query = ( + select(TranscriptORM.stable_id, XrefCORM.dbprimary_acc) + .join(ObjectXrefCORM, XrefCORM.xref_id == ObjectXrefCORM.xref_id) + .join(TranscriptORM, ObjectXrefCORM.ensembl_id == TranscriptORM.transcript_id) + .join(ExternalDbORM, ExternalDbORM.external_db_id == XrefCORM.external_db_id) + .where( + ObjectXrefCORM.ensembl_object_type == "Transcript", + ExternalDbORM.db_name.like("Ens_%_transcript") + ) + ) + result = ccds_dbi.execute(query).mappings().all() + + return result \ No newline at end of file diff --git a/src/python/ensembl/production/xrefs/parsers/DBASSParser.py b/src/python/ensembl/production/xrefs/parsers/DBASSParser.py index 9f3f6243a..0a453a029 100644 --- a/src/python/ensembl/production/xrefs/parsers/DBASSParser.py +++ b/src/python/ensembl/production/xrefs/parsers/DBASSParser.py @@ -14,101 +14,84 @@ """Parser module for DBASS sources.""" -from ensembl.production.xrefs.parsers.BaseParser import * - -EXPECTED_NUMBER_OF_COLUMNS = 23 +import csv +import re +from typing import Any, Dict, Optional, Tuple +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class DBASSParser(BaseParser): + EXPECTED_NUMBER_OF_COLUMNS = 23 + SLASH_PATTERN = re.compile(r"(.*)\s?/\s?(.*)", re.IGNORECASE | re.DOTALL) + PARENS_PATTERN = re.compile(r"(.*)\s?\((.*)\)", re.IGNORECASE | re.DOTALL) + def run(self, args: Dict[str, Any]) -> 
Tuple[int, str]: - source_id = args.get("source_id") + source_id = args.get("source_id") species_id = args.get("species_id") - xref_file = args.get("file") - xref_dbi = args.get("xref_dbi") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") if not source_id or not species_id or not xref_file: - raise AttributeError("Need to pass source_id, species_id and file") + raise AttributeError("Missing required arguments: source_id, species_id, and file") + + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"DBASS file is empty") + file_io.seek(0) + + csv_reader = csv.reader(file_io) + header = next(csv_reader) + patterns = [r"^id$", r"^genesymbol$", None, r"^ensemblreference$"] + if not self.is_file_header_valid(self.EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise ValueError(f"Malformed or unexpected header in DBASS file {xref_file}") - file_io = self.get_filehandle(xref_file) - csv_reader = csv.reader(file_io) + processed_count, unmapped_count = self.process_lines(csv_reader, source_id, species_id, xref_dbi) - # Check if header is valid - header = next(csv_reader) - patterns = [r"^id$", r"^genesymbol$", None, r"^ensemblreference$"] - if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): - raise IOError(f"Malformed or unexpected header in DBASS file {xref_file}") + result_message = f"{processed_count} direct xrefs successfully processed\n" + result_message += f"Skipped {unmapped_count} unmapped xrefs" + return 0, result_message + def process_lines(self, csv_reader: csv.reader, source_id: int, species_id: int, xref_dbi: Connection) -> Tuple[int, int]: processed_count = 0 unmapped_count = 0 - # Read lines for line in csv_reader: if not line: continue - if len(line) < EXPECTED_NUMBER_OF_COLUMNS: - line_number = 2 + processed_count + unmapped_count - raise IOError( - f"Line {line_number} of input file {xref_file} has an incorrect number of columns" - ) - - dbass_gene_id = line[0] - dbass_gene_name = line[1] - dbass_full_name = line[2] - ensembl_id = line[3] - - # Do not attempt to create unmapped xrefs. Checking truthiness is good - # enough here because the only non-empty string evaluating as false is - # not a valid Ensembl stable ID. - if ensembl_id: - # DBASS files list synonyms in two ways: either "FOO (BAR)" (with or - # without space) or "FOO/BAR". Both forms are relevant to us. 
- match = re.search( - r"(.*)\s?/\s?(.*)", dbass_gene_name, re.IGNORECASE | re.DOTALL - ) - if match: - first_gene_name = match.group(1) - second_gene_name = match.group(2) - else: - match = re.search( - r"(.*)\s?\((.*)\)", dbass_gene_name, re.IGNORECASE | re.DOTALL - ) - if match: - first_gene_name = match.group(1) - second_gene_name = match.group(2) - else: - first_gene_name = dbass_gene_name - second_gene_name = None - - label = first_gene_name - synonym = second_gene_name - ensembl_type = "gene" - version = "1" - - xref_id = self.add_xref( - { - "accession": dbass_gene_id, - "version": version, - "label": label, - "source_id": source_id, - "species_id": species_id, - "info_type": "DIRECT", - }, - xref_dbi, - ) - - if synonym: - self.add_synonym(xref_id, synonym, xref_dbi) - - self.add_direct_xref(xref_id, ensembl_id, ensembl_type, "", xref_dbi) - - processed_count += 1 - else: - unmapped_count += 1 + if len(line) < self.EXPECTED_NUMBER_OF_COLUMNS: + raise ValueError(f"Line {csv_reader.line_num} of input file has an incorrect number of columns") - file_io.close() + dbass_gene_id, dbass_gene_name, dbass_full_name, ensembl_id = line[:4] - result_message = f"{processed_count} direct xrefs successfully processed\n" - result_message += f"Skipped {unmapped_count} unmapped xrefs" + if not ensembl_id.strip(): + unmapped_count += 1 + continue - return 0, result_message + first_gene_name, second_gene_name = self.extract_gene_names(dbass_gene_name) + xref_id = self.add_xref( + { + "accession": dbass_gene_id, + "version": "1", + "label": first_gene_name, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + self.add_direct_xref(xref_id, ensembl_id, "gene", "", xref_dbi) + + if second_gene_name: + self.add_synonym(xref_id, second_gene_name, xref_dbi) + + processed_count += 1 + + return processed_count, unmapped_count + + def extract_gene_names(self, dbass_gene_name: str) -> Tuple[Optional[str], Optional[str]]: + match = self.SLASH_PATTERN.search(dbass_gene_name) or self.PARENS_PATTERN.search(dbass_gene_name) + if match: + return match.groups() + return dbass_gene_name, None diff --git a/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py b/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py index 33a7328a2..699c633ba 100644 --- a/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py +++ b/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py @@ -14,70 +14,84 @@ """Parser module for EntrezGene and WikiGene sources.""" -from ensembl.production.xrefs.parsers.BaseParser import * - -EXPECTED_NUMBER_OF_COLUMNS = 16 +import csv +import logging +import re +from typing import Any, Dict, Tuple +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class EntrezGeneParser(BaseParser): + EXPECTED_NUMBER_OF_COLUMNS = 16 + SYNONYM_SPLITTER = re.compile(r"\|") + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) - - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") - - wiki_source_id = self.get_source_id_for_source_name("WikiGene", xref_dbi) - if verbose: - logging.info(f"Wiki source id = {wiki_source_id}") - - file_io = self.get_filehandle(file) - csv_reader = csv.reader(file_io, delimiter="\t") - - # Check if header is valid - header 
= next(csv_reader) - patterns = [ - r"\A[#]?\s*tax_id", - "geneid", - "symbol", - "locustag", - "synonyms", - "dbxrefs", - "chromosome", - "map_location", - "description", - "type_of_gene", - "symbol_from_nomenclature_authority", - "full_name_from_nomenclature_authority", - "nomenclature_status", - "other_designations", - "modification_date", - "feature_type", - ] - if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): - raise IOError(f"Malformed or unexpected header in EntrezGene file {file}") - - xref_count = 0 + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) + + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") + + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError("EntrezGene file is empty") + file_io.seek(0) + + csv_reader = csv.reader(file_io, delimiter="\t") + header = next(csv_reader) + patterns = [ + r"\A[#]?\s*tax_id$", + r"^geneid$", + r"^symbol$", + r"^locustag$", + r"^synonyms$", + r"^dbxrefs$", + r"^chromosome$", + r"^map_location$", + r"^description$", + r"^type_of_gene$", + r"^symbol_from_nomenclature_authority$", + r"^full_name_from_nomenclature_authority$", + r"^nomenclature_status$", + r"^other_designations$", + r"^modification_date$", + r"^feature_type$", + ] + if not self.is_file_header_valid(self.EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise ValueError(f"Malformed or unexpected header in EntrezGene file {xref_file}") + + wiki_source_id = self.get_source_id_for_source_name("WikiGene", xref_dbi) + if verbose: + logging.info(f"Wiki source id = {wiki_source_id}") + + processed_count, syn_count = self.process_lines(csv_reader, source_id, species_id, wiki_source_id, xref_dbi) + + result_message = f"{processed_count} EntrezGene Xrefs and {processed_count} WikiGene Xrefs added with {syn_count} synonyms" + return 0, result_message + + def process_lines(self, csv_reader: csv.reader, source_id: int, species_id: int, wiki_source_id: int, xref_dbi: Connection) -> Tuple[int, int]: + processed_count = 0 syn_count = 0 seen = {} - # Read lines for line in csv_reader: if not line: continue - tax_id = line[0] + if len(line) < self.EXPECTED_NUMBER_OF_COLUMNS: + raise ValueError(f"Line {csv_reader.line_num} of input file has an incorrect number of columns") + + tax_id = int(line[0]) acc = line[1] symbol = line[2] synonyms = line[4] desc = line[8] - if tax_id != species_id: - continue - if seen.get(acc): + if tax_id != species_id or acc in seen: continue xref_id = self.add_xref( @@ -103,18 +117,14 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: xref_dbi, ) - xref_count += 1 + processed_count += 1 - syns = re.split(r"\|", synonyms) - for synonym in syns: - if synonym != "-": + if synonyms.strip() != "-": + syns = self.SYNONYM_SPLITTER.split(synonyms) + for synonym in syns: self.add_synonym(xref_id, synonym, xref_dbi) syn_count += 1 - seen[acc] = 1 - - file_io.close() + seen[acc] = True - result_message = f"{xref_count} EntrezGene Xrefs and {xref_count} WikiGene Xrefs added with {syn_count} synonyms" - - return 0, result_message + return processed_count, syn_count diff --git a/src/python/ensembl/production/xrefs/parsers/HGNCParser.py b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py index 9bcda9cbd..b8bca4e45 100644 --- a/src/python/ensembl/production/xrefs/parsers/HGNCParser.py +++ 
b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py @@ -14,130 +14,157 @@ """Parser module for HGNC source.""" -from ensembl.production.xrefs.parsers.BaseParser import * -from unidecode import unidecode +from typing import Any, Dict, List, Tuple, Optional +import csv +import logging +import re +import requests import codecs +from sqlalchemy import select +from sqlalchemy.engine import Connection +from sqlalchemy.engine.url import URL +from unidecode import unidecode +from ensembl.core.models import ( + Transcript as TranscriptORM, + AttribType as AttribTypeORM, + TranscriptAttrib as TranscriptAttribORM, +) + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class HGNCParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - dba = args["dba"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + dba = args.get("dba") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") # Parse the file string and set default user - file_params = self.parse_file_string(file) - if not file_params.get("user"): - file_params["user"] = "ensro" - - # Prepare lookup lists - swissprot = self.get_valid_codes("Uniprot/SWISSPROT", species_id, xref_dbi) - refseq = self.get_valid_codes("refseq", species_id, xref_dbi) - source_list = ["refseq_peptide", "refseq_mRNA"] - entrezgene = self.get_valid_xrefs_for_dependencies( - "EntrezGene", source_list, xref_dbi - ) + file_params = self.parse_file_string(xref_file) + file_params.setdefault("user", "ensro") # Prepare sources self_source_name = self.get_source_name_for_source_id(source_id, xref_dbi) source_ids = { - "ccds": self.get_source_id_for_source_name( - self_source_name, xref_dbi, "ccds" - ), - "entrezgene_manual": self.get_source_id_for_source_name( - self_source_name, xref_dbi, "entrezgene_manual" - ), - "refseq_manual": self.get_source_id_for_source_name( - self_source_name, xref_dbi, "refseq_manual" - ), - "ensembl_manual": self.get_source_id_for_source_name( - self_source_name, xref_dbi, "ensembl_manual" - ), - "desc_only": self.get_source_id_for_source_name( - self_source_name, xref_dbi, "desc_only" - ), + "ccds": self.get_source_id_for_source_name(self_source_name, xref_dbi, "ccds"), + "entrezgene_manual": self.get_source_id_for_source_name(self_source_name, xref_dbi, "entrezgene_manual"), + "refseq_manual": self.get_source_id_for_source_name(self_source_name, xref_dbi, "refseq_manual"), + "ensembl_manual": self.get_source_id_for_source_name(self_source_name, xref_dbi, "ensembl_manual"), + "desc_only": self.get_source_id_for_source_name(self_source_name, xref_dbi, "desc_only"), "lrg": self.get_source_id_for_source_name("LRG_HGNC_notransfer", xref_dbi), "genecards": self.get_source_id_for_source_name("GeneCards", xref_dbi), } # Statistics counts - name_count = { - "ccds": 0, - "lrg": 0, - "ensembl_manual": 0, - "genecards": 0, - "refseq_manual": 0, - "entrezgene_manual": 0, - } - mismatch = 0 + name_count = {key: 0 for key in source_ids} # Connect to the ccds db - ccds_db_url = None - if dba: - ccds_db_url = dba - elif 
file_params.get("host"): - ccds_db_url = URL.create( - "mysql", - file_params["user"], - file_params["pass"], - file_params["host"], - file_params["port"], - file_params["dbname"], - ) - else: - raise AttributeError("No ensembl ccds database provided") - + ccds_db_url = dba or self.construct_db_url(file_params) if not ccds_db_url: raise AttributeError("No ensembl ccds database provided") - else: - if verbose: - logging.info(f"Found ccds DB: {ccds_db_url}") - - # Get CCDS data - db_engine = self.get_db_engine(ccds_db_url) - with db_engine.connect() as ccds_dbi: - query = ( - select(TranscriptAttribORM.value, TranscriptORM.stable_id) - .join( - TranscriptAttribORM, - TranscriptORM.transcript_id == TranscriptAttribORM.transcript_id, - ) - .join( - AttribTypeORM, - TranscriptAttribORM.attrib_type_id == AttribTypeORM.attrib_type_id, - ) - .where(AttribTypeORM.code == "ccds_transcript") - ) - result = ccds_dbi.execute(query).mappings().all() - - ccds_to_ens = {} - for row in result: - # Remove version - ccds_id = re.sub(r"\.\d+", "", row.value) - - ccds_to_ens[ccds_id] = row.stable_id + if verbose: + logging.info(f"Found ccds DB: {ccds_db_url}") # Get HGNC file (wget or disk) - mem_file = file - if file_params.get("wget"): - response = requests.get(file_params["wget"]) - if not response.ok: - raise IOError(response.reason) - mem_file = response.text + mem_file = self.fetch_file(file_params, xref_file) # Make sure the file is utf8 mem_file = codecs.encode(mem_file, "utf-8").decode("utf-8") mem_file = re.sub(r'"', '', mem_file) - file_io = self.get_filehandle(mem_file) - csv_reader = csv.DictReader(file_io, delimiter="\t") + with self.get_filehandle(mem_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"HGNC file is empty") + file_io.seek(0) + + csv_reader = csv.DictReader(file_io, delimiter="\t") + + syn_count = self.process_lines(csv_reader, source_ids, name_count, species_id, ccds_db_url, xref_dbi) + + result_message = "HGNC xrefs loaded:\n" + for count_type, count in name_count.items(): + if count_type == "desc_only": continue + result_message += f"\t{count_type}\t{count}\n" + result_message += f"{syn_count} synonyms added\n" + result_message += f"{name_count['desc_only']} HGNC ids could not be associated in xrefs" + + return 0, result_message + + def process_lines(self, csv_reader: csv.DictReader, source_ids: Dict[str, int], name_count: Dict[str, int], species_id: int, ccds_db_url: str, xref_dbi: Connection) -> int: + # Prepare lookup lists + refseq = self.get_valid_codes("refseq", species_id, xref_dbi) + source_list = ["refseq_peptide", "refseq_mRNA"] + entrezgene = self.get_valid_xrefs_for_dependencies("EntrezGene", source_list, xref_dbi) + + # Get CCDS data + ccds_to_ens = self.get_ccds_to_ens_mapping(ccds_db_url) + + synonym_count = 0 + + # Helper function to add direct xrefs and synonyms + def add_direct_xref_and_synonyms(source_key: str, accession: str, symbol: str, feature_id: str, name: str, previous_symbols: str, synonyms: str) -> Tuple[int, int]: + xref_id = self.add_xref( + { + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids[source_key], + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + self.add_direct_xref(xref_id, feature_id, "gene", "", xref_dbi) + name_count[source_key] += 1 + + count = self.add_synonyms_for_hgnc( + { + "source_id": source_ids[source_key], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + return xref_id, 
count + + # Helper function to add dependent xrefs and synonyms + def add_dependent_xref_and_synonyms(source_key: str, master_xrefs: List[int], accession: str, symbol: str, name: str, previous_symbols: str, synonyms: str) -> int: + for xref_id in master_xrefs: + self.add_dependent_xref( + { + "master_xref_id": xref_id, + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids[source_key], + "species_id": species_id, + }, + xref_dbi, + ) + name_count[source_key] += 1 + + count = self.add_synonyms_for_hgnc( + { + "source_id": source_ids[source_key], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + return count # Read lines for line in csv_reader: @@ -147,198 +174,53 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: previous_symbols = line["Previous symbols"] synonyms = line["Alias symbols"] - seen = 0 + seen = False # Direct CCDS to ENST mappings - ccds = line["CCDS IDs"] - ccds_list = [] - if ccds: - ccds_list = re.split(r",\s", ccds) - + ccds_list = re.split(r",\s", line["CCDS IDs"]) if line["CCDS IDs"] else [] for ccds in ccds_list: enst_id = ccds_to_ens.get(ccds) if not enst_id: continue - self.add_to_direct_xrefs( - { - "stable_id": enst_id, - "ensembl_type": "gene", - "accession": accession, - "label": symbol, - "description": name, - "source_id": source_ids["ccds"], - "species_id": species_id, - }, - xref_dbi, - ) - self.add_synonyms_for_hgnc( - { - "source_id": source_ids["ccds"], - "name": accession, - "species_id": species_id, - "dead": previous_symbols, - "alias": synonyms, - }, - xref_dbi, - ) - - name_count["ccds"] += 1 + direct_xref_id, syn_count = add_direct_xref_and_synonyms("ccds", accession, symbol, enst_id, name, previous_symbols, synonyms) + synonym_count += syn_count # Direct LRG to ENST mappings - lrg_id = line["Locus specific databases"] + lrg_id = self.extract_lrg_id(line["Locus specific databases"]) if lrg_id: - match = re.search(r"(LRG_\d+)\|", lrg_id) - if match: - lrg_id = match.group(1) - - self.add_to_direct_xrefs( - { - "stable_id": lrg_id, - "ensembl_type": "gene", - "accession": accession, - "label": symbol, - "description": name, - "source_id": source_ids["lrg"], - "species_id": species_id, - }, - xref_dbi, - ) - self.add_synonyms_for_hgnc( - { - "source_id": source_ids["lrg"], - "name": accession, - "species_id": species_id, - "dead": previous_symbols, - "alias": synonyms, - }, - xref_dbi, - ) - - name_count["lrg"] += 1 + direct_xref_id, syn_count = add_direct_xref_and_synonyms("lrg", accession, symbol, lrg_id, name, previous_symbols, synonyms) + synonym_count += syn_count # Direct Ensembl mappings ensg_id = line["Ensembl gene ID"] if ensg_id: - seen = 1 - - self.add_to_direct_xrefs( - { - "stable_id": ensg_id, - "ensembl_type": "gene", - "accession": accession, - "label": symbol, - "description": name, - "source_id": source_ids["ensembl_manual"], - "species_id": species_id, - }, - xref_dbi, - ) - self.add_synonyms_for_hgnc( - { - "source_id": source_ids["ensembl_manual"], - "name": accession, - "species_id": species_id, - "dead": previous_symbols, - "alias": synonyms, - }, - xref_dbi, - ) + seen = True - name_count["ensembl_manual"] += 1 + direct_xref_id, syn_count = add_direct_xref_and_synonyms("ensembl_manual", accession, symbol, ensg_id, name, previous_symbols, synonyms) + synonym_count += syn_count # GeneCards - direct_id = self.get_xref_id( - accession, source_ids["ensembl_manual"], species_id, xref_dbi - ) hgnc_id = 
re.search(r"HGNC:(\d+)", accession).group(1) - - self.add_dependent_xref( - { - "master_xref_id": direct_id, - "accession": hgnc_id, - "label": symbol, - "description": name, - "source_id": source_ids["genecards"], - "species_id": species_id, - }, - xref_dbi, - ) - self.add_synonyms_for_hgnc( - { - "source_id": source_ids["genecards"], - "name": hgnc_id, - "species_id": species_id, - "dead": previous_symbols, - "alias": synonyms, - }, - xref_dbi, - ) - - name_count["genecards"] += 1 + synonym_count += add_dependent_xref_and_synonyms("genecards", [direct_xref_id], hgnc_id, symbol, name, previous_symbols, synonyms) # RefSeq refseq_id = line["RefSeq IDs"] if refseq_id and refseq.get(refseq_id): - seen = 1 - - for xref_id in refseq[refseq_id]: - self.add_dependent_xref( - { - "master_xref_id": xref_id, - "accession": accession, - "label": symbol, - "description": name, - "source_id": source_ids["refseq_manual"], - "species_id": species_id, - }, - xref_dbi, - ) - name_count["refseq_manual"] += 1 - - self.add_synonyms_for_hgnc( - { - "source_id": source_ids["refseq_manual"], - "name": accession, - "species_id": species_id, - "dead": previous_symbols, - "alias": synonyms, - }, - xref_dbi, - ) + seen = True + + synonym_count += add_dependent_xref_and_synonyms("refseq_manual", refseq[refseq_id], accession, symbol, name, previous_symbols, synonyms) # EntrezGene entrez_id = line["NCBI Gene ID"] if entrez_id and entrezgene.get(entrez_id): - seen = 1 - - self.add_dependent_xref( - { - "master_xref_id": entrezgene[entrez_id], - "accession": accession, - "label": symbol, - "description": name, - "source_id": source_ids["entrezgene_manual"], - "species_id": species_id, - }, - xref_dbi, - ) - self.add_synonyms_for_hgnc( - { - "source_id": source_ids["entrezgene_manual"], - "name": accession, - "species_id": species_id, - "dead": previous_symbols, - "alias": synonyms, - }, - xref_dbi, - ) + seen = True - name_count["entrezgene_manual"] += 1 + synonym_count += add_dependent_xref_and_synonyms("entrezgene_manual", [entrezgene[entrez_id]], accession, symbol, name, previous_symbols, synonyms) # Store to keep descriptions if not stored yet if not seen: - xref_id = self.add_xref( + self.add_xref( { "accession": accession, "label": symbol, @@ -349,7 +231,9 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: }, xref_dbi, ) - self.add_synonyms_for_hgnc( + name_count["desc_only"] += 1 + + synonym_count += self.add_synonyms_for_hgnc( { "source_id": source_ids["desc_only"], "name": accession, @@ -359,24 +243,18 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: }, xref_dbi, ) - mismatch += 1 - file_io.close() + return synonym_count - result_message = "HGNC xrefs loaded:\n" - for count_type, count in name_count.items(): - result_message += f"\t{count_type}\t{count}\n" - result_message += f"{mismatch} HGNC ids could not be associated in xrefs" - - return 0, result_message - - def add_synonyms_for_hgnc(self, args: Dict[str, Any], dbi: Connection) -> None: - source_id = args["source_id"] - name = args["name"] - species_id = args["species_id"] - dead_string = args.get("dead") + def add_synonyms_for_hgnc(self, args: Dict[str, Any], dbi: Connection) -> int: + source_id = args["source_id"] + name = args["name"] + species_id = args["species_id"] + dead_string = args.get("dead") alias_string = args.get("alias") + syn_count = 0 + # Dead name, add to synonym if dead_string: dead_string = re.sub('"', "", dead_string) @@ -388,8 +266,8 @@ def add_synonyms_for_hgnc(self, args: Dict[str, Any], dbi: Connection) -> None: 
except: pass dead = unidecode(dead.upper()) - self.add_to_syn(name, source_id, dead, species_id, dbi) + syn_count += 1 # Alias name, add to synonym if alias_string: @@ -402,20 +280,72 @@ def add_synonyms_for_hgnc(self, args: Dict[str, Any], dbi: Connection) -> None: except: pass alias = unidecode(alias.upper()) - self.add_to_syn(name, source_id, alias, species_id, dbi) + syn_count += 1 + + return syn_count def parse_file_string(self, file_string: str) -> Dict[str, str]: - # file_string = re.sub(r"\A\w+:", "", file_string) file_string = re.sub(r"^\w+:", "", file_string) - param_pairs = file_string.split(",") params = {} # Set provided values for pair in param_pairs: - if re.search("=>", pair): + if "=>" in pair: key, value = pair.split("=>") params[key] = value return params + + def construct_db_url(self, file_params: Dict[str, str]) -> Optional[URL]: + if file_params.get("host"): + return URL.create( + "mysql", + file_params["user"], + file_params["pass"], + file_params["host"], + file_params["port"], + file_params["dbname"], + ) + return None + + def get_ccds_to_ens_mapping(self, ccds_url: str) -> Dict[str, str]: + db_engine = self.get_db_engine(ccds_url) + with db_engine.connect() as ccds_dbi: + query = ( + select(TranscriptAttribORM.value, TranscriptORM.stable_id) + .join( + TranscriptAttribORM, + TranscriptORM.transcript_id == TranscriptAttribORM.transcript_id, + ) + .join( + AttribTypeORM, + TranscriptAttribORM.attrib_type_id == AttribTypeORM.attrib_type_id, + ) + .where(AttribTypeORM.code == "ccds_transcript") + ) + result = ccds_dbi.execute(query).mappings().all() + + ccds_to_ens = {} + for row in result: + ccds_id = re.sub(r"\.\d+", "", row.value) # Remove version + + ccds_to_ens[ccds_id] = row.stable_id + + return ccds_to_ens + + def fetch_file(self, file_params: Dict[str, str], file: str) -> str: + if file_params.get("wget"): + response = requests.get(file_params["wget"]) + if not response.ok: + raise IOError(response.reason) + return response.text + return file + + def extract_lrg_id(self, lrg_id: str) -> Optional[str]: + if lrg_id: + match = re.search(r"(LRG_\d+)\|", lrg_id) + if match: + return match.group(1) + return None diff --git a/src/python/ensembl/production/xrefs/parsers/HPAParser.py b/src/python/ensembl/production/xrefs/parsers/HPAParser.py index 76c99d769..f1268f047 100644 --- a/src/python/ensembl/production/xrefs/parsers/HPAParser.py +++ b/src/python/ensembl/production/xrefs/parsers/HPAParser.py @@ -15,60 +15,64 @@ """Parser module for HPA source.""" import csv +from typing import Dict, Any, Tuple +from sqlalchemy.engine import Connection from ensembl.production.xrefs.parsers.BaseParser import BaseParser -EXPECTED_NUMBER_OF_COLUMNS = 4 - - class HPAParser(BaseParser): + EXPECTED_NUMBER_OF_COLUMNS = 4 + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") + + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError("HPA file is empty") + file_io.seek(0) - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + csv_reader = csv.reader(file_io, delimiter=",", 
strict=True) + header = next(csv_reader) + patterns = [r"^antibody$", r"^antibody_id$", r"^ensembl_peptide_id$", r"^link$"] + if not self.is_file_header_valid(self.EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise ValueError(f"Malformed or unexpected header in HPA file {xref_file}") - file_io = self.get_filehandle(file) - csv_reader = csv.reader(file_io, delimiter=",", strict=True) + parsed_count = self.process_lines(csv_reader, source_id, species_id, xref_dbi) - # Check if header is valid - header = next(csv_reader) - patterns = ["antibody", "antibody_id", "ensembl_peptide_id", "link"] - if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): - raise IOError(f"Malformed or unexpected header in HPA file {file}") + result_message = f"{parsed_count} direct xrefs successfully parsed" + return 0, result_message + def process_lines(self, csv_reader: csv.reader, source_id: int, species_id: int, xref_dbi: Connection) -> int: parsed_count = 0 - # Read lines for line in csv_reader: if not line: continue - antibody_name = line[0] - antibody_id = line[1] - ensembl_id = line[2] + if len(line) < self.EXPECTED_NUMBER_OF_COLUMNS: + raise ValueError(f"Line {csv_reader.line_num} of input file has an incorrect number of columns") + + antibody_name, antibody_id, ensembl_id = line[:3] - self.add_to_direct_xrefs( + xref_id = self.add_xref( { "accession": antibody_id, "version": "1", "label": antibody_name, - "stable_id": ensembl_id, - "ensembl_type": "translation", "source_id": source_id, "species_id": species_id, "info_type": "DIRECT", }, xref_dbi, ) + self.add_direct_xref(xref_id, ensembl_id, "translation", "", xref_dbi) parsed_count += 1 - file_io.close() - - result_message = f"{parsed_count} direct xrefs succesfully parsed" - - return 0, result_message + return parsed_count diff --git a/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py b/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py index 8ce883d1d..94dc7466a 100644 --- a/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py +++ b/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py @@ -14,47 +14,54 @@ """Parser module for JGI source.""" -from ensembl.production.xrefs.parsers.BaseParser import * - +import re from Bio import SeqIO +from typing import Dict, Any, Tuple +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class JGI_ProteinParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") xrefs = [] - file_io = self.get_filehandle(file) - fasta_sequences = SeqIO.parse(file_io, "fasta") + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"JGIProtein file is empty") + file_io.seek(0) - for fasta in fasta_sequences: - accession = fasta.id - sequence = fasta.seq + fasta_sequences = SeqIO.parse(file_io, "fasta") - # Extract accession value - accession = re.search(r"^ci0100(\w+?)$", accession).group(1) + for fasta in fasta_sequences: + accession = fasta.id + sequence = str(fasta.seq) 
- # Build an xref object and store it - xref = { - "ACCESSION": accession, - "SEQUENCE": sequence, - "SOURCE_ID": source_id, - "SPECIES_ID": species_id, - "SEQUENCE_TYPE": "peptide", - } - xrefs.append(xref) + # Extract accession value + match = re.search(r"^ci0100(\w+?)$", accession) + if not match: + continue + accession = match.group(1) - file_io.close() + # Build an xref object and store it + xref = { + "ACCESSION": accession, + "SEQUENCE": sequence, + "SOURCE_ID": source_id, + "SPECIES_ID": species_id, + "SEQUENCE_TYPE": "peptide", + "INFO_TYPE": "SEQUENCE_MATCH", + } + xrefs.append(xref) self.upload_xref_object_graphs(xrefs, xref_dbi) - result_message = "%d JGI_ xrefs succesfully parsed" % len(xrefs) + result_message = f"{len(xrefs)} JGI_ xrefs successfully parsed" return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/MGIDescParser.py b/src/python/ensembl/production/xrefs/parsers/MGIDescParser.py new file mode 100644 index 000000000..a348c7227 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/MGIDescParser.py @@ -0,0 +1,107 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for MGI Descriptions.""" + +import csv +import logging +import re +from typing import Any, Dict, Tuple +from sqlalchemy.engine import Connection + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser + +class MGIDescParser(BaseParser): + EXPECTED_NUMBER_OF_COLUMNS = 12 + SYNONYM_SPLITTER = re.compile(r"[|]") + + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) + + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") + + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError("MGI_desc file is empty") + file_io.seek(0) + + csv_reader = csv.reader(file_io, delimiter="\t", strict=True, quotechar=None, escapechar=None) + header = next(csv_reader) + patterns = [ + r"^mgi accession id$", + r"^chr$", + r"^cm position$", + r"^genome coordinate start$", + r"^genome coordinate end$", + r"^strand$", + r"^marker symbol$", + r"^status$", + r"^marker name$", + r"^marker type$", + r"^feature type$", + r"^marker synonyms \(pipe-separated\)$", + ] + if not self.is_file_header_valid(self.EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise ValueError(f"Malformed or unexpected header in MGI_desc file {xref_file}") + + xref_count, syn_count = self.process_lines(csv_reader, source_id, species_id, xref_dbi, verbose) + + result_message = f"{xref_count} MGI Description Xrefs added\n{syn_count} synonyms added" + return 0, result_message + + def process_lines(self, csv_reader: csv.reader, source_id: int, species_id: int, xref_dbi: Connection, verbose: bool) 
-> Tuple[int, int]: + xref_count = 0 + syn_count = 0 + + for line in csv_reader: + if not line: + continue + + if len(line) < self.EXPECTED_NUMBER_OF_COLUMNS: + raise ValueError(f"Line {csv_reader.line_num} of input file has an incorrect number of columns") + + accession = line[0] + label = line[6] + marker = line[8] + synonym_field = line[11] + + xref_id = self.add_xref( + { + "accession": accession, + "label": label, + "description": marker, + "source_id": source_id, + "species_id": species_id, + "info_type": "MISC", + }, + xref_dbi, + ) + + if not marker and verbose: + logging.info(f"{accession} has no description") + + xref_count += 1 + + if synonym_field: + synonyms = self.SYNONYM_SPLITTER.split(synonym_field) + for synonym in synonyms: + self.add_synonym(xref_id, synonym, xref_dbi) + syn_count += 1 + + return xref_count, syn_count diff --git a/src/python/ensembl/production/xrefs/parsers/MGIParser.py b/src/python/ensembl/production/xrefs/parsers/MGIParser.py index 2508d516a..4d95de9e8 100644 --- a/src/python/ensembl/production/xrefs/parsers/MGIParser.py +++ b/src/python/ensembl/production/xrefs/parsers/MGIParser.py @@ -14,59 +14,68 @@ """Parser module for MGI source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import csv +from typing import Dict, Any, Tuple, List +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class MGIParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") - syn_hash = self.get_ext_synonyms("MGI", xref_dbi) + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError("MGI file is empty") + file_io.seek(0) - file_io = self.get_filehandle(file) - csv_reader = csv.reader(file_io, delimiter="\t", strict=True) + csv_reader = csv.reader(file_io, delimiter="\t", strict=True) + syn_hash = self.get_ext_synonyms("MGI", xref_dbi) + count, syn_count = self.process_lines(csv_reader, source_id, species_id, xref_dbi, syn_hash) + + result_message = f"{count} direct MGI xrefs added\n{syn_count} synonyms added" + return 0, result_message + + def process_lines(self, csv_reader: csv.reader, source_id: int, species_id: int, xref_dbi: Connection, syn_hash: Dict[str, List[str]]) -> Tuple[int, int]: count = 0 syn_count = 0 - # Read lines for line in csv_reader: if not line: continue accession = line[0] + label = line[1] + description = line[2] ensembl_id = line[5] xref_id = self.add_xref( { "accession": accession, "version": 0, - "label": line[1], - "description": line[2], + "label": label, + "description": description, "source_id": source_id, "species_id": species_id, "info_type": "DIRECT", }, xref_dbi, ) - self.add_direct_xref(xref_id, ensembl_id, "Gene", "", xref_dbi) + self.add_direct_xref(xref_id, ensembl_id, "gene", "", xref_dbi) - if syn_hash.get(accession): - for synonym in syn_hash[accession]: + synonyms = syn_hash.get(accession) + if synonyms: + for synonym in synonyms: self.add_synonym(xref_id, synonym, xref_dbi) syn_count += 1 
count += 1 - file_io.close() - - result_message = f"{count} direct MGI xrefs added\n" - result_message += f"{syn_count} synonyms added" - - return 0, result_message + return count, syn_count diff --git a/src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py b/src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py deleted file mode 100644 index ae1fbb3dd..000000000 --- a/src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py +++ /dev/null @@ -1,107 +0,0 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Parser module for MGI CCDS source.""" - -from ensembl.production.xrefs.parsers.BaseParser import * - - -class MGI_CCDS_Parser(BaseParser): - def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] - - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") - - source_ids = [] - labels = {} - versions = {} - descriptions = {} - accessions = {} - - query = select(SourceUORM.source_id).filter(SourceUORM.name.like("MGI")) - result = xref_dbi.execute(query).fetchall() - for row in result: - source_ids.append(row[0]) - - query = select( - XrefUORM.accession, XrefUORM.label, XrefUORM.version, XrefUORM.description - ).filter(XrefUORM.source_id.in_(source_ids)) - - for row in xref_dbi.execute(query).mappings().all(): - if row["description"]: - accessions[row["label"]] = row.accession - labels[row["accession"]] = row.label - versions[row["accession"]] = row.version - descriptions[row["accession"]] = row.description - - # Get master xref ids via the ccds label - ccds_label_to_xref_id = {} - query = select(XrefUORM.label, XrefUORM.xref_id).where( - XrefUORM.source_id == SourceUORM.source_id, SourceUORM.name == "CCDS" - ) - result = xref_dbi.execute(query).fetchall() - for row in result: - ccds_label_to_xref_id[row[0]] = row[1] - - count = 0 - ccds_missing = 0 - mgi_missing = 0 - - mgi_io = self.get_filehandle(file) - for line in mgi_io: - line = line.rstrip() - if not line: - continue - - fields = line.split("\t") - chromosome = fields[0] - g_accession = fields[1] - gene_name = fields[2] - entrez_id = fields[3] - ccds = fields[4] - - if ccds_label_to_xref_id.get(ccds): - if accessions.get(gene_name) and labels.get(accessions[gene_name]): - accession = accessions[gene_name] - self.add_dependent_xref( - { - "master_xref_id": ccds_label_to_xref_id[ccds], - "accession": accession, - "version": versions[accession], - "label": labels[accession], - "description": descriptions[accession], - "source_id": source_id, - "species_id": species_id, - }, - xref_dbi, - ) - - count += 1 - else: - mgi_missing += 1 - else: - ccds_missing += 1 - - mgi_io.close() - - result_message = f"Added {count} MGI xrefs via CCDS\n" - result_message += ( - f"{ccds_missing} CCDS not resolved, {mgi_missing} MGI not 
found" - ) - - return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py b/src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py deleted file mode 100644 index 010298200..000000000 --- a/src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py +++ /dev/null @@ -1,101 +0,0 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Parser module for MGI Descriptions.""" - -from ensembl.production.xrefs.parsers.BaseParser import * - -EXPECTED_NUMBER_OF_COLUMNS = 12 - - -class MGI_Desc_Parser(BaseParser): - def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) - - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") - - file_io = self.get_filehandle(file) - csv_reader = csv.reader( - file_io, delimiter="\t", strict=True, quotechar=None, escapechar=None - ) - - # Check if header is valid - header = next(csv_reader) - patterns = [ - "mgi accession id", - "chr", - "cm position", - "genome coordinate start", - "genome coordinate end", - "strand", - "marker symbol", - "status", - "marker name", - "marker type", - "feature type", - r"marker\ssynonyms\s\(pipe\-separated\)", - ] - if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): - raise IOError(f"Malformed or unexpected header in MGI_desc file {file}") - - xref_count = 0 - syn_count = 0 - acc_to_xref = {} - - # Read lines - for line in csv_reader: - if not line: - continue - - accession = line[0] - marker = line[8] - - xref_id = self.add_xref( - { - "accession": accession, - "label": line[6], - "description": marker, - "source_id": source_id, - "species_id": species_id, - "info_type": "MISC", - }, - xref_dbi, - ) - acc_to_xref[accession] = xref_id - - if not marker and verbose: - logging.info(f"{accession} has no description") - - xref_count += 1 - - if acc_to_xref.get(accession): - synonym_field = line[11] - if synonym_field: - synonyms = re.split(r"[|]", synonym_field) - - for synonym in synonyms: - self.add_synonym(xref_id, synonym, xref_dbi) - syn_count += 1 - - file_io.close() - - result_message = f"{xref_count} MGI Description Xrefs added\n" - result_message += f"{syn_count} synonyms added" - - return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/MIMParser.py b/src/python/ensembl/production/xrefs/parsers/MIMParser.py index 1ae4f5952..829c5de21 100644 --- a/src/python/ensembl/production/xrefs/parsers/MIMParser.py +++ b/src/python/ensembl/production/xrefs/parsers/MIMParser.py @@ -14,29 +14,30 @@ """Parser module for MIM source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import re +import logging +from typing import Any, Dict, List, Optional, Tuple +from sqlalchemy.engine import 
Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class MIMParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - general_source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) - - if not general_source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") - + general_source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) + + if not general_source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") + old_to_new, removed = {}, {} - sources = [] - - sources.append(general_source_id) + sources = [general_source_id] gene_source_id = self.get_source_id_for_source_name("MIM_GENE", xref_dbi) - sources.append(gene_source_id) morbid_source_id = self.get_source_id_for_source_name("MIM_MORBID", xref_dbi) - sources.append(morbid_source_id) + sources.extend([gene_source_id, morbid_source_id]) TYPE_SINGLE_SOURCES = { "*": gene_source_id, @@ -48,30 +49,24 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: counters = {gene_source_id: 0, morbid_source_id: 0, "removed": 0, "synonyms": 0} if verbose: - logging.info("Sources are: " + ", ".join(map(str, sources))) - - for section in self.get_file_sections(file, "*RECORD*"): - if len(section) == 1: - continue + logging.info(f"Sources are: {', '.join(map(str, sources))}") + for section in self.get_file_sections(xref_file, "*RECORD*"): record = "".join(section) - # Extract the TI field + # Extract the TI field from the record ti = self.extract_ti(record) if not ti: - raise IOError("Failed to extract TI field from record") + raise ValueError("Failed to extract TI field from record") - # Extract record type - (record_type, number, long_desc) = self.parse_ti(ti) + # Extract record type, number, and description from the TI field + record_type, number, long_desc = self.parse_ti(ti) if record_type is None: - raise IOError( - "Failed to extract record type and description from TI field" - ) + raise ValueError("Failed to extract record type and description from TI field") # Use the first block of text as description fields = re.split(";;", long_desc, flags=re.MULTILINE | re.DOTALL) - label = fields[0] - label = f"{label} [{record_type}{number}]" + label = f"{fields[0]} [{record_type}{number}]" xref_object = { "accession": number, @@ -81,79 +76,72 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: "info_type": "UNMAPPED", } - if TYPE_SINGLE_SOURCES.get(record_type): + if record_type in TYPE_SINGLE_SOURCES: type_source = TYPE_SINGLE_SOURCES[record_type] xref_object["source_id"] = type_source counters[type_source] += 1 - - xref_id = self.add_xref(xref_object, xref_dbi) + self.add_xref(xref_object, xref_dbi) elif record_type == "+": # This type means both gene and phenotype, add both xref_object["source_id"] = gene_source_id counters[gene_source_id] += 1 - xref_id = self.add_xref(xref_object, xref_dbi) + self.add_xref(xref_object, xref_dbi) xref_object["source_id"] = morbid_source_id counters[morbid_source_id] += 1 - xref_id = self.add_xref(xref_object, xref_dbi) + self.add_xref(xref_object, xref_dbi) elif record_type == "^": - match = re.search( - r"MOVED\sTO\s(\d+)", long_desc, flags=re.MULTILINE | re.DOTALL - ) - if match: - new_number = match.group(1) - if 
new_number != number: - old_to_new[number] = new_number - elif long_desc == "REMOVED FROM DATABASE": - removed[number] = 1 - counters["removed"] += 1 - else: - raise IOError(f"Unsupported type of a '^' record: '{long_desc}'") - - # Generate synonyms from "MOVED TO" entries - for old, new in old_to_new.items(): - # Some entries in the MIM database have been moved multiple times - # Keep traversing the chain of renames until we have reached the end - while old_to_new.get(new): - new = old_to_new[new] + self.handle_moved_or_removed_record(number, long_desc, old_to_new, removed, counters) - # Check if the entry has been removed from the database - if not removed.get(new): - self.add_to_syn_for_mult_sources( - new, sources, old, species_id, xref_dbi - ) - counters["synonyms"] += 1 + self.generate_synonyms_from_moved_entries(old_to_new, removed, sources, species_id, xref_dbi, counters) - result_message = "%d genemap and %d phenotype MIM xrefs added\n" % ( - counters[gene_source_id], - counters[morbid_source_id], + result_message = ( + f"{counters[gene_source_id]} genemap and {counters[morbid_source_id]} phenotype MIM xrefs added\n" + f"\t{counters['synonyms']} synonyms (defined by MOVED TO) added\n" + f"\t{counters['removed']} entries removed" ) - result_message += ( - "\t%d synonyms (defined by MOVED TO) added\n" % counters["synonyms"] - ) - result_message += "\t%d entries removed" % counters["removed"] return 0, result_message - def extract_ti(self, input_record: str) -> str: - ti = None - + def extract_ti(self, input_record: str) -> Optional[str]: + """Extract the TI field from the input record.""" match = re.search( r"[*]FIELD[*]\sTI\n(.+?)\n?(?:[*]FIELD[*]| [*]RECORD[*]| [*]THEEND[*])", input_record, flags=re.MULTILINE | re.DOTALL, ) - if match: - ti = match.group(1) - - return ti + return match.group(1) if match else None def parse_ti(self, ti: str) -> Tuple[Optional[str], Optional[str], Optional[str]]: + """Parse the TI field to extract record type, number, and description.""" ti = re.sub(r"(?:;;\n|\n;;)", ";;", ti, flags=re.MULTILINE | re.DOTALL) ti = re.sub(r"\n", "", ti, flags=re.MULTILINE | re.DOTALL) match = re.search(r"\A([#%+*^]*)(\d+)\s+(.+)", ti) + return match.groups() if match else (None, None, None) + + def handle_moved_or_removed_record(self, number: str, long_desc: str, old_to_new: Dict[str, str], removed: Dict[str, int], counters: Dict[str, int]) -> None: + """Handle records that have been moved or removed.""" + match = re.search(r"MOVED\sTO\s(\d+)", long_desc, flags=re.MULTILINE | re.DOTALL) if match: - return match.group(1), match.group(2), match.group(3) + new_number = match.group(1) + if new_number != number: + old_to_new[number] = new_number + elif long_desc == "REMOVED FROM DATABASE": + removed[number] = 1 + counters["removed"] += 1 + else: + raise IOError(f"Unsupported type of a '^' record: '{long_desc}'") + + def generate_synonyms_from_moved_entries(self, old_to_new: Dict[str, str], removed: Dict[str, int], sources: List[int], species_id: int, xref_dbi: Connection, counters: Dict[str, int]) -> None: + """Generate synonyms from 'MOVED TO' entries.""" + for old, new in old_to_new.items(): + # Some entries in the MIM database have been moved multiple times + # Keep traversing the chain of renames until we have reached the end + while old_to_new.get(new): + new = old_to_new[new] - return None, None, None + # Check if the entry has been removed from the database + if not removed.get(new): + self.add_to_syn_for_mult_sources(new, sources, old, species_id, xref_dbi) + 
counters["synonyms"] += 1 diff --git a/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py b/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py index 6c7688889..4a1654fc5 100644 --- a/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py +++ b/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py @@ -14,27 +14,57 @@ """Parser module for MIM to Gene source.""" -from ensembl.production.xrefs.parsers.BaseParser import * - -EXPECTED_NUMBER_OF_COLUMNS = 6 +import csv +import re +import logging +from typing import Any, Dict, Tuple +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class Mim2GeneParser(BaseParser): + EXPECTED_NUMBER_OF_COLUMNS = 6 + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - general_source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) + general_source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) + + if not general_source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") + + counters = { + "all_entries": 0, + "dependent_on_entrez": 0, + "missed_master": 0, + "missed_omim": 0, + } + + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"Mim2Gene file is empty") + file_io.seek(0) + + csv_reader = csv.reader(file_io, delimiter="\t") - if not general_source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + self.process_lines(csv_reader, xref_file, species_id, counters, verbose, xref_dbi) + result_message = ( + f"Processed {counters['all_entries']} entries. Out of those\n" + f"\t{counters['missed_omim']} had missing OMIM entries,\n" + f"\t{counters['dependent_on_entrez']} were dependent EntrezGene xrefs,\n" + f"\t{counters['missed_master']} had missing master entries." 
+ ) + # result_message = f"all={counters['all_entries']} -- missed_omim={counters['missed_omim']} -- dependent_on_entrez={counters['dependent_on_entrez']} -- missed_master={counters['missed_master']}" + + return 0, result_message + + def process_lines(self, csv_reader: csv.reader, xref_file:str, species_id: int, counters: Dict[str, int], verbose: bool, xref_dbi: Connection) -> None: # Get needed source IDs mim_gene_source_id = self.get_source_id_for_source_name("MIM_GENE", xref_dbi) - mim_morbid_source_id = self.get_source_id_for_source_name( - "MIM_MORBID", xref_dbi - ) + mim_morbid_source_id = self.get_source_id_for_source_name("MIM_MORBID", xref_dbi) entrez_source_id = self.get_source_id_for_source_name("EntrezGene", xref_dbi) # This will be used to prevent insertion of duplicates @@ -45,16 +75,6 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: mim_morbid = self.get_valid_codes("MIM_MORBID", species_id, xref_dbi) entrez = self.get_valid_codes("EntrezGene", species_id, xref_dbi) - counters = { - "all_entries": 0, - "dependent_on_entrez": 0, - "missed_master": 0, - "missed_omim": 0, - } - - file_io = self.get_filehandle(file) - csv_reader = csv.reader(file_io, delimiter="\t") - # Read lines for line in csv_reader: if not line: @@ -67,34 +87,21 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: if is_comment: patterns = [ r"\A[#]?\s*MIM[ ]number", - "GeneID", - "type", - "Source", - "MedGenCUI", - "Comment", + r"^GeneID$", + r"^type$", + r"^Source$", + r"^MedGenCUI$", + r"^Comment$", ] - if len( - line - ) == EXPECTED_NUMBER_OF_COLUMNS and not self.is_file_header_valid( - EXPECTED_NUMBER_OF_COLUMNS, patterns, line, True - ): - raise IOError( - f"Malformed or unexpected header in Mim2Gene file {file}" - ) + if not self.is_file_header_valid(self.EXPECTED_NUMBER_OF_COLUMNS, patterns, line, True): + raise ValueError(f"Malformed or unexpected header in Mim2Gene file {xref_file}") continue - if len(line) != EXPECTED_NUMBER_OF_COLUMNS: - raise IOError( - f"Line {csv_reader.line_num} of input file {file} has an incorrect number of columns" - ) + if len(line) != self.EXPECTED_NUMBER_OF_COLUMNS: + raise ValueError(f"Line {csv_reader.line_num} of input file has an incorrect number of columns") fields = [re.sub(r"\s+\Z", "", x) for x in line] - omim_acc = fields[0] - entrez_id = fields[1] - type = fields[2] - source = fields[3] - medgen = fields[4] - comment = fields[5] + omim_acc, entrez_id, type = fields[:3] counters["all_entries"] += 1 @@ -109,15 +116,8 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: continue # Check if type is known - if verbose and type not in [ - "gene", - "gene/phenotype", - "predominantly phenotypes", - "phenotype", - ]: - logging.warn( - f"Unknown type {type} for MIM Number {omim_acc} ({file}:{csv_reader.line_num})" - ) + if verbose and type not in ["gene", "gene/phenotype", "predominantly phenotypes", "phenotype"]: + logging.warning(f"Unknown type {type} for MIM Number {omim_acc} ({xref_file}:{csv_reader.line_num})") # With all the checks taken care of, insert the mappings. We check # both MIM_GENE and MIM_MORBID every time because some MIM entries @@ -145,26 +145,11 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: xref_dbi, ) - file_io.close() - - result_message = ( - "Processed %d entries. 
Out of those\n" % counters["all_entries"] - ) - result_message += "\t%d had missing OMIM entries,\n" % counters["missed_omim"] - result_message += ( - "\t%d were dependent EntrezGene xrefs,\n" % counters["dependent_on_entrez"] - ) - result_message += "\t%d had missing master entries." % counters["missed_master"] - - return 0, result_message - def process_xref_entry(self, args: Dict[str, Any], dbi: Connection) -> int: count = 0 - for ent_id in args["entrez_xrefs"]: self.add_dependent_xref_maponly( args["mim_xref_id"], args["mim_source_id"], ent_id, None, dbi, True ) count += 1 - return count diff --git a/src/python/ensembl/production/xrefs/parsers/RFAMParser.py b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py index c7d4990eb..e760cbf9e 100644 --- a/src/python/ensembl/production/xrefs/parsers/RFAMParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py @@ -14,33 +14,50 @@ """Parser module for RFAM source.""" -from ensembl.production.xrefs.parsers.BaseParser import * - +import logging +import os +import re +import wget +from typing import Any, Dict, List, Optional, Tuple +from urllib.parse import urlparse +from sqlalchemy import and_, select +from sqlalchemy.engine import Connection +from sqlalchemy.engine.url import URL + +from ensembl.core.models import ( + Analysis as AnalysisORM, + Transcript as TranscriptORM, + ExonTranscript as ExonTranscriptORM, + SupportingFeature as SupportingFeatureORM, + DnaAlignFeature as DnaAlignFeatureORM, +) + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class RFAMParser(BaseParser): + ACCESSION_PATTERN = re.compile(r"^#=GF\sAC\s+(\w+)", re.MULTILINE) + LABEL_PATTERN = re.compile(r"^#=GF\sID\s+([^\n]+)", re.MULTILINE) + DESCRIPTION_PATTERN = re.compile(r"^#=GF\sDE\s+([^\n]+)", re.MULTILINE) + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - species_name = args["species_name"] - file = args["file"] - dba = args["dba"] - ensembl_release = args["ensembl_release"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) - - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + source_id = args.get("source_id") + species_id = args.get("species_id") + species_name = args.get("species_name") + xref_file = args.get("file") + dba = args.get("dba") + ensembl_release = args.get("ensembl_release") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) + + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") # Extract db connection parameters from file - wget_url, db_user, db_host, db_port, db_name, db_pass = ( - self.extract_params_from_string( - file, ["wget", "user", "host", "port", "dbname", "pass"] - ) + wget_url, db_user, db_host, db_port, db_name, db_pass = self.extract_params_from_string( + xref_file, ["wget", "user", "host", "port", "dbname", "pass"] ) - if not db_user: - db_user = "ensro" - if not db_port: - db_port = "3306" + db_user = db_user or "ensro" + db_port = db_port or "3306" # Get the species name(s) species_id_to_names = self.species_id_to_names(xref_dbi) @@ -53,27 +70,34 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: species_name = species_id_to_names[species_id][0] # Connect to the appropriate rfam db + rfam_db_url = self.get_rfam_db_url(db_host, db_user, db_pass, db_port, db_name, dba, species_name, ensembl_release, verbose) + 
if not rfam_db_url: + raise AttributeError("Could not find RFAM DB.") + if verbose: + logging.info(f"Found RFAM DB: {rfam_db_url}") + + # Download file through wget if url present + if wget_url: + xref_file = self.download_file(wget_url, xref_file) + + # Add xrefs + xref_count, direct_count = self.process_lines(xref_file, rfam_db_url, source_id, species_id, xref_dbi) + + result_message = f"Added {xref_count} RFAM xrefs and {direct_count} direct xrefs" + return 0, result_message + + def get_rfam_db_url(self, db_host: str, db_user: str, db_pass: str, db_port: str, db_name: str, dba: str, species_name: str, ensembl_release: str, verbose: bool) -> Any: if db_host: - rfam_db_url = URL.create( - "mysql", db_user, db_pass, db_host, db_port, db_name - ) + return URL.create("mysql", db_user, db_pass, db_host, db_port, db_name) elif dba: - rfam_db_url = dba + return dba else: if verbose: logging.info("Looking for db in mysql-ens-sta-1") registry = "ensro@mysql-ens-sta-1:4519" - rfam_db_url = self.get_db_from_registry( - species_name, "core", ensembl_release, registry - ) + return self.get_db_from_registry(species_name, "core", ensembl_release, registry) - if not rfam_db_url: - raise IOError(f"Could not find RFAM DB.") - else: - if verbose: - logging.info(f"Found RFAM DB: {rfam_db_url}") - - # Get data from rfam db + def get_rfam_transcript_stable_ids(self, rfam_db_url: Any) -> Dict[str, List[str]]: db_engine = self.get_db_engine(rfam_db_url) with db_engine.connect() as rfam_dbi: query = ( @@ -103,8 +127,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: ) .join( DnaAlignFeatureORM, - DnaAlignFeatureORM.dna_align_feature_id - == SupportingFeatureORM.feature_id, + DnaAlignFeatureORM.dna_align_feature_id == SupportingFeatureORM.feature_id, ) .order_by(DnaAlignFeatureORM.hit_name) ) @@ -113,81 +136,61 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: # Create a dict with RFAM accessions as keys and value is an array of ensembl transcript stable_ids rfam_transcript_stable_ids = {} for row in result: - rfam_id = None - match = re.search(r"^(RF\d+)", row.hit_name) if match: rfam_id = match.group(1) - - if rfam_id: rfam_transcript_stable_ids.setdefault(rfam_id, []).append(row.stable_id) - # Download file through wget if url present - if wget_url: - uri = urlparse(wget_url) - file = os.path.join(os.path.dirname(file), os.path.basename(uri.path)) - wget.download(wget_url, file) - - # Read data from file - lines = [] - entry = "" - - file_io = gzip.open(file, "r") - for line in file_io: - line = line.decode("latin-1") - if re.search(r"^//", line): - lines.append(entry) - entry = "" - elif ( - re.search(r"^#=GF\sAC", line) - or re.search(r"^#=GF\sID", line) - or re.search(r"^#=GF\sDE", line) - ): - entry += line - file_io.close() + return rfam_transcript_stable_ids - # Add xrefs + def download_file(self, wget_url: str, rfam_file: str) -> str: + uri = urlparse(wget_url) + rfam_file = os.path.join(os.path.dirname(rfam_file), os.path.basename(uri.path)) + wget.download(wget_url, rfam_file) + + return rfam_file + + def process_lines(self, xref_file: str, rfam_db_url: Any, source_id: int, species_id: int, xref_dbi: Connection) -> Tuple[int, int]: xref_count, direct_count = 0, 0 - for entry in lines: - accession, label, description = None, None, None + # Get data from rfam db + rfam_transcript_stable_ids = self.get_rfam_transcript_stable_ids(rfam_db_url) + + for section in self.get_file_sections(xref_file, "//\n", "utf-8"): + entry = "".join(section) # Extract data from entry - match = 
re.search(r"^#=GF\sAC\s+(\w+)", entry, flags=re.MULTILINE) - if match: - accession = match.group(1) - match = re.search(r"^#=GF\sID\s+([^\n]+)", entry, flags=re.MULTILINE) - if match: - label = match.group(1) - match = re.search(r"^#=GF\sDE\s+([^\n]+)", entry, flags=re.MULTILINE) - if match: - description = match.group(1) - - if accession: - if rfam_transcript_stable_ids.get(accession): - xref_id = self.add_xref( - { - "accession": accession, - "version": 0, - "label": label or accession, - "description": description, - "source_id": source_id, - "species_id": species_id, - "info_type": "DIRECT", - }, - xref_dbi, - ) - xref_count += 1 - - transcript_stable_ids = rfam_transcript_stable_ids[accession] - for stable_id in transcript_stable_ids: - self.add_direct_xref( - xref_id, stable_id, "Transcript", "", xref_dbi - ) - direct_count += 1 - - result_message = ( - f"Added {xref_count} RFAM xrefs and {direct_count} direct xrefs" - ) + accession, label, description = self.extract_entry_data(entry) + + if accession and rfam_transcript_stable_ids.get(accession): + print("accession in dict") + xref_id = self.add_xref( + { + "accession": accession, + "version": 0, + "label": label or accession, + "description": description, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + xref_count += 1 - return 0, result_message + for stable_id in rfam_transcript_stable_ids[accession]: + self.add_direct_xref(xref_id, stable_id, "Transcript", "", xref_dbi) + direct_count += 1 + + return xref_count, direct_count + + def extract_entry_data(self, entry: str) -> Tuple[Optional[str], Optional[str], Optional[str]]: + accession = self.extract_pattern(entry, self.ACCESSION_PATTERN) + label = self.extract_pattern(entry, self.LABEL_PATTERN) + description = self.extract_pattern(entry, self.DESCRIPTION_PATTERN) + + return accession, label, description + + def extract_pattern(self, text: str, pattern: re.Pattern) -> Optional[str]: + match = pattern.search(text) + return match.group(1) if match else None diff --git a/src/python/ensembl/production/xrefs/parsers/RGDParser.py b/src/python/ensembl/production/xrefs/parsers/RGDParser.py index 11ddd0e0e..54b574e82 100644 --- a/src/python/ensembl/production/xrefs/parsers/RGDParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RGDParser.py @@ -14,57 +14,65 @@ """Parser module for RGD source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import csv +import re +from typing import Any, Dict, List, Tuple +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class RGDParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") - direct_source_id = self.get_source_id_for_source_name( - "RGD", xref_dbi, "direct_xref" - ) + direct_source_id = self.get_source_id_for_source_name("RGD", xref_dbi, "direct_xref") - # Used to assign dbIDs for when RGD Xrefs are dependent on RefSeq xrefs - preloaded_refseq = self.get_valid_codes("refseq", 
species_id, xref_dbi) + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"RGD file is empty") + file_io.seek(0) - rgd_io = self.get_filehandle(file) - csv_reader = csv.DictReader( - filter(lambda row: row[0] != "#", rgd_io), delimiter="\t" + csv_reader = csv.DictReader(filter(lambda row: row[0] != "#", file_io), delimiter="\t") + + dependent_count, ensembl_count, mismatch_count, syn_count = self.process_lines(csv_reader, source_id, direct_source_id, species_id, xref_dbi) + + result_message = ( + f"{dependent_count} xrefs successfully loaded and dependent on refseq\n" + f"\t{mismatch_count} xrefs added but with NO dependencies\n" + f"\t{ensembl_count} direct xrefs successfully loaded\n" + f"\tAdded {syn_count} synonyms, including duplicates" ) - header_found, count, ensembl_count, mismatch, syn_count = 0, 0, 0, 0, 0 - columns = {} + return 0, result_message + + def process_lines(self, csv_reader: csv.DictReader, source_id: int, direct_source_id: int, species_id: int, xref_dbi: Connection) -> Tuple[int, int, int, int]: + dependent_count, ensembl_count, mismatch_count, syn_count = 0, 0, 0, 0 - # Read lines - for line in csv_reader: - # Don't bother doing anything if we don't have an RGD ID - if not line.get("GENE_RGD_ID") or not line["GENE_RGD_ID"]: - continue + # Used to assign dbIDs for when RGD Xrefs are dependent on RefSeq xrefs + preloaded_refseq = self.get_valid_codes("refseq", species_id, xref_dbi) - # Some RGD annotation is directly copied from Ensembl - if re.search("ENSRNO", line["SYMBOL"]): + for line in csv_reader: + # Don't bother doing anything if we don't have an RGD ID or if the symbol is an Ensembl ID + if not line["GENE_RGD_ID"] or re.search("ENSRNO", line["SYMBOL"]): continue - genbank_nucleotides = [] - if line.get("GENBANK_NUCLEOTIDE"): - genbank_nucleotides = line["GENBANK_NUCLEOTIDE"].split(";") + genbank_nucleotides = line["GENBANK_NUCLEOTIDE"].split(";") + done = False - done = 0 # The nucleotides are sorted in the file in alphabetical order. 
Filter them down # to a higher quality subset, then add dependent Xrefs where possible for nucleotide in self.sort_refseq_accessions(genbank_nucleotides): - if not done and preloaded_refseq.get(nucleotide): - for xref in preloaded_refseq[nucleotide]: + if not done and nucleotide in preloaded_refseq: + for master_xref_id in preloaded_refseq[nucleotide]: xref_id = self.add_dependent_xref( { - "master_xref_id": xref, + "master_xref_id": master_xref_id, "accession": line["GENE_RGD_ID"], "label": line["SYMBOL"], "description": line["NAME"], @@ -73,43 +81,35 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: }, xref_dbi, ) + dependent_count += 1 - count += 1 - syn_count += self.process_synonyms( - xref_id, line["OLD_SYMBOL"], xref_dbi - ) - done = 1 + syn_count += self.process_synonyms(xref_id, line["OLD_SYMBOL"], xref_dbi) + done = True # Add direct xrefs - if line.get("ENSEMBL_ID"): + if line["ENSEMBL_ID"]: ensembl_ids = line["ENSEMBL_ID"].split(";") - - for id in ensembl_ids: - self.add_to_direct_xrefs( + for ensembl_id in ensembl_ids: + xref_id = self.add_xref( { - "stable_id": id, - "ensembl_type": "gene", "accession": line["GENE_RGD_ID"], "label": line["SYMBOL"], "description": line["NAME"], "source_id": direct_source_id, "species_id": species_id, + "info_type": "DIRECT", }, xref_dbi, ) - xref_id = self.get_xref_id( - line["GENE_RGD_ID"], direct_source_id, species_id, xref_dbi - ) - + self.add_direct_xref(xref_id, ensembl_id, "gene", "", xref_dbi) ensembl_count += 1 - syn_count += self.process_synonyms( - xref_id, line["OLD_SYMBOL"], xref_dbi - ) - done = 1 + + syn_count += self.process_synonyms(xref_id, line["OLD_SYMBOL"], xref_dbi) + done = True # If neither direct or dependent, add misc xref if not done: - xref_id = self.add_xref( + self.add_xref( { "accession": line["GENE_RGD_ID"], "label": line["SYMBOL"], @@ -120,35 +120,23 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: }, xref_dbi, ) - mismatch += 1 - - rgd_io.close() - - result_message = f"{count} xrefs succesfully loaded and dependent on refseq\n" - result_message += f"\t{mismatch} xrefs added but with NO dependencies\n" - result_message += f"\t{ensembl_count} direct xrefs successfully loaded\n" - result_message += f"\tAdded {syn_count} synonyms, including duplicates" - - return 0, result_message + mismatch_count += 1 + + return dependent_count, ensembl_count, mismatch_count, syn_count def sort_refseq_accessions(self, accessions: List[str]) -> List[str]: refseq_priorities = {"NM": 1, "NP": 1, "NR": 1, "XM": 2, "XP": 2, "XR": 2} - - accessions = sorted( - [x for x in accessions if x[:2] in refseq_priorities], - key=lambda x: (refseq_priorities[x[:2]], x), + return sorted( + (acc for acc in accessions if acc[:2] in refseq_priorities), + key=lambda acc: (refseq_priorities[acc[:2]], acc), ) - return accessions def process_synonyms(self, xref_id: int, synonym_string: str, dbi: Connection) -> int: - syn_count = 0 - if not synonym_string or not xref_id: - return syn_count + return 0 synonyms = synonym_string.split(";") for synonym in synonyms: self.add_synonym(xref_id, synonym, dbi) - syn_count += 1 - return syn_count + return len(synonyms) diff --git a/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py b/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py index 4ae9b46d8..c00df2cc4 100644 --- a/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py +++ b/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py @@ -14,38 +14,31 @@ """Parser module for Reactome source.""" -from 
ensembl.production.xrefs.parsers.BaseParser import * +import logging +import re +from typing import Any, Dict, Optional, Tuple +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class ReactomeParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - species_name = args["species_name"] - file = args["file"] - release_file = args["rel_file"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) + source_id = args.get("source_id") + species_id = args.get("species_id") + species_name = args.get("species_name") + xref_file = args.get("file") + release_file = args.get("rel_file") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") # Parse release file if release_file: - release = None - - release_io = self.get_filehandle(release_file) - for line in release_io: - match = re.search(r"([0-9]*)", line) - if match: - release = match.group(1) - if verbose: - logging.info(f"Reactome release is '{release}'") - release_io.close() - + release = self.parse_release_file(release_file, verbose) if not release: - raise IOError(f"Could not find release using {release_file}") - + raise ValueError(f"Could not find release using {release_file}") self.set_release(source_id, release, xref_dbi) # Create a hash of all valid names for this species @@ -60,130 +53,122 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: alias_to_species_id = {alias: 1 for alias in aliases} # Get relevant source ids - reactome_source_id = self.get_source_id_for_source_name( - "reactome", xref_dbi, "direct" - ) - transcript_reactome_source_id = self.get_source_id_for_source_name( - "reactome_transcript", xref_dbi - ) - gene_reactome_source_id = self.get_source_id_for_source_name( - "reactome_gene", xref_dbi - ) - reactome_uniprot_source_id = self.get_source_id_for_source_name( - "reactome", xref_dbi, "uniprot" + source_ids = self.get_source_ids(xref_dbi, verbose) + + parsed_count, dependent_count, direct_count, error_count = self.process_file(xref_file, alias_to_species_id, source_ids, species_id, xref_dbi, verbose) + + result_message = ( + f"{parsed_count} Reactome entries processed\n" + f"\t{dependent_count} dependent xrefs added\n" + f"\t{direct_count} direct xrefs added\n" + f"\t{error_count} not found" ) + return 0, result_message - # Cannot continue unless source ids are found - if ( - not reactome_source_id - or not transcript_reactome_source_id - or not gene_reactome_source_id - ): - raise KeyError("Could not find source id for reactome sources") - else: - if verbose: - logging.info(f"Source_id = {reactome_source_id}") - logging.info(f"Transcript_source_id = {transcript_reactome_source_id}") - logging.info(f"Gene_source_id = {gene_reactome_source_id}") - - if not reactome_uniprot_source_id: - raise KeyError("Could not find source id for reactome uniprot") - else: - if verbose: - logging.info(f"Uniprot_source_id = {reactome_uniprot_source_id}") - - # Get uniprot accessions - is_uniprot = 0 - uniprot_accessions = {} - if re.search("UniProt", file): - is_uniprot = 1 - uniprot_accessions = self.get_valid_codes("uniprot/", species_id, xref_dbi) - - parsed_count, err_count = 0, 0 - - # 
Read file - reactome_io = self.get_filehandle(file) - - for line in reactome_io: - line = line.strip() - - (ensembl_stable_id, reactome_id, url, description, evidence, species) = ( - re.split(r"\t+", line) - ) - - # Check description pattern - match = re.search( - r"^[A-Za-z0-9_,\(\)\/\-\.:\+'&;\"\/\?%>\s\[\]]+$", description - ) - if not match: - continue - - species = re.sub(r"\s", "_", species) - species = species.lower() - - current_source_id = reactome_source_id - - if alias_to_species_id.get(species): - parsed_count += 1 - - ensembl_type = None - info_type = "DIRECT" - - # Add uniprot dependent xrefs - if is_uniprot: - if uniprot_accessions.get(ensembl_stable_id): - for xref in uniprot_accessions[ensembl_stable_id]: - xref_id = self.add_dependent_xref( - { - "master_xref_id": xref, - "accession": reactome_id, - "label": reactome_id, - "description": description, - "source_id": reactome_uniprot_source_id, - "species_id": species_id, - }, - xref_dbi, - ) - info_type = "DEPENDENT" - - # Attempt to guess the object_type based on the stable id - elif re.search(r"G[0-9]*$", ensembl_stable_id): - ensembl_type = "gene" - current_source_id = gene_reactome_source_id - elif re.search(r"T[0-9]*$", ensembl_stable_id): - ensembl_type = "transcript" - current_source_id = transcript_reactome_source_id - elif re.search(r"P[0-9]*$", ensembl_stable_id): - ensembl_type = "translation" - - # Is not in Uniprot and does not match Ensembl stable id format - else: + def parse_release_file(self, release_file: str, verbose: bool) -> Optional[str]: + release = None + with self.get_filehandle(release_file) as release_io: + for line in release_io: + match = re.search(r"([0-9]*)", line) + if match: + release = match.group(1) if verbose: - logging.debug(f"Could not find type for {ensembl_stable_id}") - err_count += 1 + logging.info(f"Reactome release is '{release}'") + return release + + def get_source_ids(self, xref_dbi: Connection, verbose: bool) -> Tuple[int, int, int, int]: + reactome_source_id = self.get_source_id_for_source_name("reactome", xref_dbi, "direct") + transcript_reactome_source_id = self.get_source_id_for_source_name("reactome_transcript", xref_dbi) + gene_reactome_source_id = self.get_source_id_for_source_name("reactome_gene", xref_dbi) + reactome_uniprot_source_id = self.get_source_id_for_source_name("reactome", xref_dbi, "uniprot") + + if verbose: + logging.info(f"Source_id = {reactome_source_id}") + logging.info(f"Transcript_source_id = {transcript_reactome_source_id}") + logging.info(f"Gene_source_id = {gene_reactome_source_id}") + logging.info(f"Uniprot_source_id = {reactome_uniprot_source_id}") + + return { + "reactome_source_id": reactome_source_id, + "transcript_reactome_source_id": transcript_reactome_source_id, + "gene_reactome_source_id": gene_reactome_source_id, + "reactome_uniprot_source_id": reactome_uniprot_source_id + } + + def process_file(self, xref_file: str, alias_to_species_id: Dict[str, int], source_ids: Dict[str, int], species_id: int, xref_dbi: Connection, verbose: bool) -> Tuple[int, int, int, int]: + parsed_count, dep_count, direct_count, err_count = 0, 0, 0, 0 + + # Get existing uniprot accessions + is_uniprot = bool(re.search("UniProt", xref_file)) + uniprot_accessions = self.get_valid_codes("uniprot/", species_id, xref_dbi) if is_uniprot else {} + + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"Reactome file is empty") + file_io.seek(0) + + for line in file_io: + line = line.strip() + ensembl_stable_id, reactome_id, url, 
description, evidence, species = re.split(r"\t+", line) + + # Check description pattern + if not re.match(r"^[A-Za-z0-9_,\(\)\/\-\.:\+'&;\"\/\?%>\s\[\]]+$", description): continue - # Add new entry for reactome xref as well as direct xref to ensembl stable id - xref_id = self.add_xref( - { - "accession": reactome_id, - "label": reactome_id, - "description": description, - "source_id": current_source_id, - "species_id": species_id, - "info_type": info_type, - }, - xref_dbi, - ) - - if ensembl_type: - self.add_direct_xref( - xref_id, ensembl_stable_id, ensembl_type, "", xref_dbi - ) - - reactome_io.close() - - result_message = f"{parsed_count} entries processed\n" - result_message += f"{err_count} not found" - - return 0, result_message + species = re.sub(r"\s", "_", species).lower() + + # Continue only for current species + if alias_to_species_id.get(species): + parsed_count += 1 + + ensembl_type = None + info_type = "DIRECT" + current_source_id = source_ids["reactome_source_id"] + + if is_uniprot: + if uniprot_accessions.get(ensembl_stable_id): # Add uniprot dependent xrefs + for xref in uniprot_accessions[ensembl_stable_id]: + self.add_dependent_xref( + { + "master_xref_id": xref, + "accession": reactome_id, + "label": reactome_id, + "description": description, + "source_id": source_ids["reactome_uniprot_source_id"], + "species_id": species_id, + "info_type": "DEPENDENT", + }, + xref_dbi, + ) + dep_count += 1 + elif re.search(r"G[0-9]*$", ensembl_stable_id): # Attempt to guess the object_type based on the stable id + ensembl_type = "gene" + current_source_id = source_ids["gene_reactome_source_id"] + elif re.search(r"T[0-9]*$", ensembl_stable_id): + ensembl_type = "transcript" + current_source_id = source_ids["transcript_reactome_source_id"] + elif re.search(r"P[0-9]*$", ensembl_stable_id): + ensembl_type = "translation" + else: # Is not in Uniprot and does not match Ensembl stable id format + if verbose: + logging.debug(f"Could not find type for {ensembl_stable_id}") + err_count += 1 + continue + + # Add new entry for reactome xref as well as direct xref to ensembl stable id + if ensembl_type: + xref_id = self.add_xref( + { + "accession": reactome_id, + "label": reactome_id, + "description": description, + "source_id": current_source_id, + "species_id": species_id, + "info_type": info_type, + }, + xref_dbi, + ) + self.add_direct_xref(xref_id, ensembl_stable_id, ensembl_type, "", xref_dbi) + direct_count += 1 + + return parsed_count, dep_count, direct_count, err_count diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py index 14f6f76dd..61662dcaf 100644 --- a/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py @@ -14,60 +14,27 @@ """Parser module for RefSeq coordinate xrefs.""" -from ensembl.production.xrefs.parsers.BaseParser import * -from ensembl.common.RangeRegistry import RangeRegistry +import json +import logging +import subprocess +from typing import Any, Dict, Tuple +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class RefSeqCoordinateParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - species_name = args["species_name"] - file = args["file"] - dba = args["dba"] - ensembl_release = args["ensembl_release"] - xref_dbi = args["xref_dbi"] - 
verbose = args.get("verbose", False) - - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + source_id = args.get("source_id") + species_id = args.get("species_id") + species_name = args.get("species_name") + dba = args.get("dba") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) - source_ids = { - "peptide": self.get_source_id_for_source_name( - "RefSeq_peptide", xref_dbi, "otherfeatures" - ), - "mrna": self.get_source_id_for_source_name( - "RefSeq_mRNA", xref_dbi, "otherfeatures" - ), - "ncrna": self.get_source_id_for_source_name( - "RefSeq_ncRNA", xref_dbi, "otherfeatures" - ), - "peptide_predicted": self.get_source_id_for_source_name( - "RefSeq_peptide_predicted", xref_dbi, "otherfeatures" - ), - "mrna_predicted": self.get_source_id_for_source_name( - "RefSeq_mRNA_predicted", xref_dbi, "otherfeatures" - ), - "ncrna_predicted": self.get_source_id_for_source_name( - "RefSeq_ncRNA_predicted", xref_dbi, "otherfeatures" - ), - "entrezgene": self.get_source_id_for_source_name("EntrezGene", xref_dbi), - "wikigene": self.get_source_id_for_source_name("WikiGene", xref_dbi), - } + if not source_id or not species_id: + raise AttributeError("Missing required arguments: source_id and species_id") - if verbose: - logging.info(f'RefSeq_peptide source ID = {source_ids["peptide"]}') - logging.info(f'RefSeq_mRNA source ID = {source_ids["mrna"]}') - logging.info(f'RefSeq_ncRNA source ID = {source_ids["ncrna"]}') - logging.info( - f'RefSeq_peptide_predicted source ID = {source_ids["peptide_predicted"]}' - ) - logging.info( - f'RefSeq_mRNA_predicted source ID = {source_ids["mrna_predicted"]}' - ) - logging.info( - f'RefSeq_ncRNA_predicted source ID = {source_ids["ncrna_predicted"]}' - ) + source_ids = self.get_source_ids(verbose, xref_dbi) # Get the species name(s) species_id_to_names = self.species_id_to_names(xref_dbi) @@ -80,17 +47,57 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: # Connect to the appropriate dbs if dba: - scripts_dir = args["perl_scripts_dir"] - xref_db_url = args["xref_db_url"] - source_ids_json = json.dumps(source_ids) - - logging.info( - f"Running perl script {scripts_dir}/refseq_coordinate_parser.pl" - ) - perl_cmd = f"perl {scripts_dir}/refseq_coordinate_parser.pl --xref_db_url '{xref_db_url}' --core_db_url '{args['core_db_url']}' --otherf_db_url '{dba}' --source_ids '{source_ids_json}' --species_id {species_id} --species_name {species_name} --release {ensembl_release}" - cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE) - - return 0, "Added refseq_import xrefs." + return self.run_perl_script(args, source_ids, species_name) else: # Not all species have an otherfeatures database, skip if not found return 0, f"Skipped. No otherfeatures database for '{species_name}'." 
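# Illustrative sketch only (not part of the patch): roughly how one of these xref
# parsers is expected to be driven, based on the run(args) contract visible above.
# The engine URL, source/species ids and the absent otherfeatures handle below are
# assumptions for the example, not values taken from this pipeline.
from sqlalchemy import create_engine

from ensembl.production.xrefs.parsers.RefSeqCoordinateParser import RefSeqCoordinateParser

engine = create_engine("mysql://user:pass@host/xref_update_db")  # assumed URL
with engine.connect() as xref_dbi:
    parser = RefSeqCoordinateParser()
    # With no otherfeatures database handle ("dba"), run() is expected to skip
    # the species and return the "Skipped" message rather than raising.
    status, message = parser.run({
        "source_id": 101,                # assumed
        "species_id": 9606,              # assumed
        "species_name": "homo_sapiens",
        "dba": None,
        "xref_dbi": xref_dbi,
        "verbose": True,
    })
    print(status, message)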
+ + def get_source_ids(self, verbose: bool, xref_dbi: Connection) -> Dict[str, int]: + source_ids = { + "peptide": self.get_source_id_for_source_name("RefSeq_peptide", xref_dbi, "otherfeatures"), + "mrna": self.get_source_id_for_source_name("RefSeq_mRNA", xref_dbi, "otherfeatures"), + "ncrna": self.get_source_id_for_source_name("RefSeq_ncRNA", xref_dbi, "otherfeatures"), + "peptide_predicted": self.get_source_id_for_source_name("RefSeq_peptide_predicted", xref_dbi, "otherfeatures"), + "mrna_predicted": self.get_source_id_for_source_name("RefSeq_mRNA_predicted", xref_dbi, "otherfeatures"), + "ncrna_predicted": self.get_source_id_for_source_name("RefSeq_ncRNA_predicted", xref_dbi, "otherfeatures"), + "entrezgene": self.get_source_id_for_source_name("EntrezGene", xref_dbi), + "wikigene": self.get_source_id_for_source_name("WikiGene", xref_dbi), + } + + if verbose: + logging.info(f'RefSeq_peptide source ID = {source_ids["peptide"]}') + logging.info(f'RefSeq_mRNA source ID = {source_ids["mrna"]}') + logging.info(f'RefSeq_ncRNA source ID = {source_ids["ncrna"]}') + logging.info(f'RefSeq_peptide_predicted source ID = {source_ids["peptide_predicted"]}') + logging.info(f'RefSeq_mRNA_predicted source ID = {source_ids["mrna_predicted"]}') + logging.info(f'RefSeq_ncRNA_predicted source ID = {source_ids["ncrna_predicted"]}') + + return source_ids + + def run_perl_script(self, args: Dict[str, Any], source_ids: Dict[str, int], species_name: str) -> Tuple[int, str]: + # For now, we run a perl script to add the xrefs, which has some mandatory arguments + scripts_dir = args.get("perl_scripts_dir") + xref_db_url = args.get("xref_db_url") + if not scripts_dir or not xref_db_url: + raise AttributeError("Missing required arguments: perl_scripts_dir and xref_db_url") + + source_ids_json = json.dumps(source_ids) + + logging.info(f"Running perl script {scripts_dir}/refseq_coordinate_parser.pl") + perl_cmd = ( + f"perl {scripts_dir}/refseq_coordinate_parser.pl " + f"--xref_db_url '{xref_db_url}' " + f"--core_db_url '{args.get('core_db_url')}' " + f"--otherf_db_url '{args.get('dba')}' " + f"--source_ids '{source_ids_json}' " + f"--species_id {args.get('species_id')} " + f"--species_name {species_name} " + f"--release {args.get('ensembl_release')}" + ) + cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if cmd_output.returncode != 0: + logging.error(f"Perl script ({scripts_dir}/refseq_coordinate_parser.pl) failed with error: {cmd_output.stderr.decode('utf-8')}") + return 1, "Failed to add refseq_import xrefs." + + return 0, "Added refseq_import xrefs." diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py b/src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py deleted file mode 100644 index 93d773270..000000000 --- a/src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py +++ /dev/null @@ -1,341 +0,0 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Parser module for RefSeq sources (dna and peptide).""" - -from ensembl.production.xrefs.parsers.BaseParser import * - - -class RefSeqGPFFParser(BaseParser): - def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - species_name = args["species_name"] - file = args["file"] - release_file = args["rel_file"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) - - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") - - # Get needed source ids - source_ids = { - "peptide_source_id": self.get_source_id_for_source_name( - "RefSeq_peptide", xref_dbi - ), - "mrna_source_id": self.get_source_id_for_source_name( - "RefSeq_mRNA", xref_dbi, "refseq" - ), - "ncrna_source_id": self.get_source_id_for_source_name( - "RefSeq_ncRNA", xref_dbi - ), - "pred_peptide_source_id": self.get_source_id_for_source_name( - "RefSeq_peptide_predicted", xref_dbi - ), - "pred_mrna_source_id": self.get_source_id_for_source_name( - "RefSeq_mRNA_predicted", xref_dbi, "refseq" - ), - "pred_ncrna_source_id": self.get_source_id_for_source_name( - "RefSeq_ncRNA_predicted", xref_dbi - ), - "entrez_source_id": self.get_source_id_for_source_name( - "EntrezGene", xref_dbi - ), - "wiki_source_id": self.get_source_id_for_source_name("WikiGene", xref_dbi), - } - - if verbose: - logging.info( - f'RefSeq_peptide source ID = {source_ids["peptide_source_id"]}' - ) - logging.info(f'RefSeq_mRNA source ID = {source_ids["mrna_source_id"]}') - logging.info(f'RefSeq_ncRNA source ID = {source_ids["ncrna_source_id"]}') - logging.info( - f'RefSeq_peptide_predicted source ID = {source_ids["pred_peptide_source_id"]}' - ) - logging.info( - f'RefSeq_mRNA_predicted source ID = {source_ids["pred_mrna_source_id"]}' - ) - logging.info( - f'RefSeq_ncRNA_predicted source ID = {source_ids["pred_ncrna_source_id"]}' - ) - logging.info(f'EntrezGene source ID = {source_ids["entrez_source_id"]}') - logging.info(f'WikiGene source ID = {source_ids["wiki_source_id"]}') - - # Extract version from release file - if release_file: - # Parse and set release info - index = 0 - for section in self.get_file_sections(release_file, "***"): - index += 1 - if index == 2: - release = "".join(section) - release = re.sub(r"\s{2,}", " ", release) - release = release.strip() - release = re.sub( - r".*(NCBI Reference Sequence.*) Distribution.*", r"\1", release - ) - release = re.sub(r"Release (\d+)", r"Release \1,", release) - break - - # Set releases - self.set_release(source_ids["peptide_source_id"], release, xref_dbi) - self.set_release(source_ids["mrna_source_id"], release, xref_dbi) - self.set_release(source_ids["ncrna_source_id"], release, xref_dbi) - self.set_release(source_ids["pred_mrna_source_id"], release, xref_dbi) - self.set_release(source_ids["pred_ncrna_source_id"], release, xref_dbi) - self.set_release(source_ids["pred_peptide_source_id"], release, xref_dbi) - - result_message = self.create_xrefs( - source_ids, species_id, species_name, file, xref_dbi - ) - - return 0, result_message - - def create_xrefs(self, source_ids: Dict[str, int], species_id: int, species_name: str, file: str, dbi: Connection) -> str: - counts = { - "num_mrna": 0, - "num_ncrna": 0, - "num_pred_mrna": 0, - "num_pred_ncrna": 0, - "num_peptide": 0, - "num_pred_peptide": 0, - "num_entrez": 0, - "num_wiki": 0, - } - - # Create a dict 
of all valid names for this species - species_id_to_names = self.species_id_to_names(dbi) - if species_name: - species_id_to_names.setdefault(species_id, []).append(species_name) - if not species_id_to_names.get(species_id): - return "Skipped. Could not find species ID to name mapping" - names = species_id_to_names[species_id] - name_to_species_id = {name: species_id for name in names} - - # Create a dict of all valid taxon_ids for this species - species_id_to_tax = self.species_id_to_taxonomy(dbi) - species_id_to_tax.setdefault(species_id, []).append(species_id) - tax_ids = species_id_to_tax[species_id] - tax_to_species_id = {tax_id: species_id for tax_id in tax_ids} - - # Retrieve existing RefSeq mRNA, EntrezGene, and WikiGene xrefs - entrez_acc_to_label = self.get_acc_to_label("EntrezGene", species_id, dbi) - refseq_ids = self.get_valid_codes("RefSeq_mRNA", species_id, dbi) - refseq_ids.update( - self.get_valid_codes("RefSeq_mRNA_predicted", species_id, dbi) - ) - entrez_ids = self.get_valid_codes("EntrezGene", species_id, dbi) - wiki_ids = self.get_valid_codes("WikiGene", species_id, dbi) - - # Get file type - file_type = self.type_from_file(os.path.basename(file)) - if not file_type: - return f"Could not work out sequence type for {file}" - - xrefs = [] - - # Read file - for section in self.get_file_sections(file, "//\n"): - if len(section) == 1: - continue - - entry = "".join(section) - xref = {} - - # Extract the species name - species_id_check = None - match = re.search(r"\s+ORGANISM\s+(.*)\n", entry) - if match: - species = match.group(1).lower() - species = re.sub(r"^\s*", "", species) - species = re.sub(r"\s*\(.+\)", "", species) - species = re.sub(r"\s+", "_", species) - species = re.sub(r"\n", "", species) - - species_id_check = name_to_species_id[species] - - # Try going through the taxon ID if species check didn't work - if not species_id_check: - match = re.search(r"db_xref=\"taxon:(\d+)\"", entry) - if match: - taxon_id = match.group(1) - species_id_check = tax_to_species_id[taxon_id] - - # Skip xrefs for species that aren't in the species table - if not species_id_check or species_id != species_id_check: - continue - - # Extract accession and version - accession = re.search( - r"^ACCESSION\s+(\S+)", entry, flags=re.MULTILINE - ).group(1) - version = re.search(r"^VERSION\s+(\S+)", entry, flags=re.MULTILINE).group(1) - - # Get the right source ID based on file type and whether this is predicted (X*) or not - source_id = 0 - if file_type == "dna": - if re.search(r"^XM_", accession): - source_id = source_ids["pred_mrna_source_id"] - counts["num_pred_mrna"] += 1 - elif re.search(r"^XR", accession): - source_id = source_ids["pred_ncrna_source_id"] - counts["num_pred_ncrna"] += 1 - elif re.search(r"^NM", accession): - source_id = source_ids["mrna_source_id"] - counts["num_mrna"] += 1 - elif re.search(r"^NR", accession): - source_id = source_ids["ncrna_source_id"] - counts["num_ncrna"] += 1 - elif file_type == "peptide": - if re.search(r"^XP_", accession): - source_id = source_ids["pred_peptide_source_id"] - counts["num_pred_peptide"] += 1 - else: - source_id = source_ids["peptide_source_id"] - counts["num_peptide"] += 1 - - if not source_id: - logging.warning( - f"Could not get source ID for file type {file_type} for accession {accession}" - ) - - (acc_no_version, version) = version.split(".") - xref["ACCESSION"] = accession - if accession == acc_no_version: - xref["VERSION"] = version - - # Extract description (may be multi-line) - description = re.search( - 
r"^DEFINITION\s+([^[]+)", entry, flags=re.MULTILINE - ).group(1) - description = re.sub(r"\nACCESSION.*", "", description, flags=re.DOTALL) - description = re.sub(r"\n", "", description) - description = re.sub(r"{.*}-like", "", description) - description = re.sub(r"{.*}", "", description) - description = re.sub(r"\s+", " ", description) - if len(description) > 255: - description = description[0:255] - - # Extract sequence - sequence = re.search( - r"^\s*ORIGIN\s+(.+)", entry, flags=re.DOTALL | re.MULTILINE - ).group(1) - sequence_lines = sequence.split("\n") - parsed_sequence = "" - for seq_line in sequence_lines: - if seq_line: - sequence_only = re.search(r"^\s*\d+\s+(.*)$", seq_line).group(1) - if not sequence_only: - continue - parsed_sequence += sequence_only - parsed_sequence = re.sub(r"\s", "", parsed_sequence) - - # Extract related pair to current RefSeq accession - # For rna file, the pair is the protein_id - # For peptide file, the pair is in DBSOURCE REFSEQ accession - refseq_pair = None - match = re.search(r"DBSOURCE\s+REFSEQ: accession (\S+)", entry) - if match: - refseq_pair = match.group(1) - protein_id = re.findall(r"\/protein_id=.(\S+_\d+)", entry) - coded_by = re.findall(r"\/coded_by=.(\w+_\d+)", entry) - - for cb in coded_by: - xref["PAIR"] = cb - - if not xref.get("PAIR"): - xref["PAIR"] = refseq_pair - - if not xref.get("PAIR"): - for pi in protein_id: - xref["PAIR"] = pi - - xref["LABEL"] = f"{accession}.{version}" - xref["DESCRIPTION"] = description - xref["SOURCE_ID"] = source_id - xref["SEQUENCE"] = parsed_sequence - xref["SEQUENCE_TYPE"] = file_type - xref["SPECIES_ID"] = species_id - xref["INFO_TYPE"] = "SEQUENCE_MATCH" - xref["DEPENDENT_XREFS"] = [] - - # Extrat NCBIGene ids - seen_in_record = {} - ncbi_gene_ids = re.findall(r"db_xref=.GeneID:(\d+)", entry) - for gene_id in ncbi_gene_ids: - if not seen_in_record.get(gene_id) and entrez_acc_to_label.get(gene_id): - seen_in_record[gene_id] = 1 - - dependent = {} - dependent["SOURCE_ID"] = source_ids["entrez_source_id"] - dependent["LINKAGE_SOURCE_ID"] = source_id - dependent["ACCESSION"] = gene_id - dependent["LABEL"] = entrez_acc_to_label[gene_id] - xref["DEPENDENT_XREFS"].append(dependent) - counts["num_entrez"] += 1 - - dependent = {} - dependent["SOURCE_ID"] = source_ids["wiki_source_id"] - dependent["LINKAGE_SOURCE_ID"] = source_id - dependent["ACCESSION"] = gene_id - dependent["LABEL"] = entrez_acc_to_label[gene_id] - xref["DEPENDENT_XREFS"].append(dependent) - counts["num_wiki"] += 1 - - # Add xrefs for RefSeq mRNA as well where available - if refseq_pair: - refseq_pair = re.sub(r"\.[0-9]*", "", refseq_pair) - if refseq_pair: - if refseq_ids.get(refseq_pair): - for refseq_id in refseq_ids[refseq_pair]: - for entrez_id in entrez_ids.get(gene_id): - self.add_dependent_xref_maponly( - entrez_id, - source_ids["entrez_source_id"], - refseq_id, - None, - dbi, - ) - for wiki_id in wiki_ids.get(gene_id): - self.add_dependent_xref_maponly( - wiki_id, - source_ids["entrez_source_id"], - refseq_id, - None, - dbi, - ) - - xrefs.append(xref) - - if len(xrefs) > 0: - self.upload_xref_object_graphs(xrefs, dbi) - - result_message = f'Added {counts["num_mrna"]} mRNA xrefs, {counts["num_pred_mrna"]} predicted mRNA xrefs, {counts["num_ncrna"]} ncRNA xrefs, {counts["num_pred_ncrna"]} predicted ncRNA xrefs, {counts["num_peptide"]} peptide xrefs, and {counts["num_pred_peptide"]} predicted peptide xrefs\n' - result_message += f"Added the following dependent xrefs:\n" - result_message += f'\tEntrezGene\t{counts["num_entrez"]}\n' - 
result_message += f'\tWikiGene\t{counts["num_wiki"]}\n' - - return result_message - - def type_from_file(self, file_name: str) -> Optional[str]: - if re.search("RefSeq_protein", file_name): - return "peptide" - if re.search("rna", file_name): - return "dna" - if re.search("protein", file_name): - return "peptide" - - return None diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqParser.py b/src/python/ensembl/production/xrefs/parsers/RefSeqParser.py new file mode 100644 index 000000000..f9e62c218 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/RefSeqParser.py @@ -0,0 +1,316 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for RefSeq sources (dna and peptide).""" + +import os +import re +import logging +from typing import Any, Dict, Optional, Tuple +from sqlalchemy.engine import Connection + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser + +class RefSeqParser(BaseParser): + ORGAMISM_PATTERN = re.compile(r"\s+ORGANISM\s+(.*)\n") + TAXON_PATTERN = re.compile(r"db_xref=\"taxon:(\d+)\"") + ACCESSION_PATTERN = re.compile(r"^ACCESSION\s+(\S+)", re.MULTILINE) + VERSION_PATTERN = re.compile(r"^VERSION\s+(\S+)", re.MULTILINE) + TYPE_PATTERNS = { + "dna": { + re.compile(r"^XM_"): ("num_pred_mrna", "pred_mrna_source_id"), + re.compile(r"^XR"): ("num_pred_ncrna", "pred_ncrna_source_id"), + re.compile(r"^NM"): ("num_mrna", "mrna_source_id"), + re.compile(r"^NR"): ("num_ncrna", "ncrna_source_id"), + }, + "peptide": { + re.compile(r"^XP_"): ("num_pred_peptide", "pred_peptide_source_id"), + } + } + DESCRIPTION_PATTERN = re.compile(r"^DEFINITION\s+(.+?)(?=\n\S)", re.DOTALL | re.MULTILINE) + DESC_REMOVE_BRACES_PATTERN = re.compile(r"\{.*?\}-like|\{.*?\}") + NORMALIZE_WHITESPACE_PATTERN = re.compile(r"\s+") + SEQUENCE_PATTERN = re.compile(r"^\s*ORIGIN\s+(.+)", re.DOTALL | re.MULTILINE) + SEQ_REMOVE_NUMBERS_PATTERN = re.compile(r"\d+\s+") + PROTEIN_IDS_PATTERN = re.compile(r"\/protein_id=.(\S+_\d+)") + CODED_BY_PATTERN = re.compile(r"\/coded_by=.(\w+_\d+)") + DBSOURCE_PATTERN = re.compile(r"^DBSOURCE\s+REFSEQ: accession (\S+_\d+)") + GENEID_PATTERN = re.compile(r"db_xref=.GeneID:(\d+)") + + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args.get("source_id") + species_id = args.get("species_id") + species_name = args.get("species_name") + xref_file = args.get("file") + release_file = args.get("rel_file") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) + + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") + + # Get needed source ids + source_ids = { + "peptide_source_id": self.get_source_id_for_source_name("RefSeq_peptide", xref_dbi), + "mrna_source_id": self.get_source_id_for_source_name("RefSeq_mRNA", xref_dbi, "refseq"), + "ncrna_source_id": self.get_source_id_for_source_name("RefSeq_ncRNA", xref_dbi), 
+ "pred_peptide_source_id": self.get_source_id_for_source_name("RefSeq_peptide_predicted", xref_dbi), + "pred_mrna_source_id": self.get_source_id_for_source_name("RefSeq_mRNA_predicted", xref_dbi, "refseq"), + "pred_ncrna_source_id": self.get_source_id_for_source_name("RefSeq_ncRNA_predicted", xref_dbi), + "entrez_source_id": self.get_source_id_for_source_name("EntrezGene", xref_dbi), + "wiki_source_id": self.get_source_id_for_source_name("WikiGene", xref_dbi), + } + + if verbose: + for key, value in source_ids.items(): + logging.info(f'{key} = {value}') + + # Extract version from release file + if release_file: + release = self.extract_release_info(release_file) + + if release: + if verbose: + logging.info(f"RefSeq release info: {release}") + + for key in ["peptide_source_id", "mrna_source_id", "ncrna_source_id", "pred_mrna_source_id", "pred_ncrna_source_id", "pred_peptide_source_id"]: + self.set_release(source_ids[key], release, xref_dbi) + + result_message = self.create_xrefs(source_ids, species_id, species_name, xref_file, xref_dbi) + return 0, result_message + + def extract_release_info(self, release_file: str) -> str: + release_info = "" + for section in self.get_file_sections(release_file, "***"): + release_info = "".join(section) + break + + match = re.search(r"(NCBI Reference Sequence.*?)(?=Distribution)", release_info, re.DOTALL) + if match: + release = " ".join(match.group(1).split()) + release = re.sub(r"Release (\d+)", r"Release \1,", release) + return release + + return None + + def create_xrefs(self, source_ids: Dict[str, int], species_id: int, species_name: str, xref_file: str, dbi: Connection) -> str: + counts = { + "num_mrna": 0, + "num_ncrna": 0, + "num_pred_mrna": 0, + "num_pred_ncrna": 0, + "num_peptide": 0, + "num_pred_peptide": 0, + "num_entrez": 0, + "num_wiki": 0, + } + + # Create a dict of all valid names for this species + species_id_to_names = self.species_id_to_names(dbi) + if species_name: + species_id_to_names.setdefault(species_id, []).append(species_name) + if not species_id_to_names.get(species_id): + return "Skipped. Could not find species ID to name mapping" + names = species_id_to_names[species_id] + name_to_species_id = {name: species_id for name in names} + + # Create a dict of all valid taxon_ids for this species + species_id_to_tax = self.species_id_to_taxonomy(dbi) + species_id_to_tax.setdefault(species_id, []).append(species_id) + tax_ids = species_id_to_tax[species_id] + tax_to_species_id = {tax_id: species_id for tax_id in tax_ids} + + # Get file type + file_type = self.type_from_file(os.path.basename(xref_file)) + if not file_type: + return f"Skipped. 
Could not work out sequence type for {xref_file}" + + # Retrieve existing RefSeq mRNA, EntrezGene, and WikiGene xrefs + entrez_acc_to_label = self.get_acc_to_label("EntrezGene", species_id, dbi) + refseq_ids = self.get_valid_codes("RefSeq_mRNA", species_id, dbi) + refseq_ids.update(self.get_valid_codes("RefSeq_mRNA_predicted", species_id, dbi)) + entrez_ids = self.get_valid_codes("EntrezGene", species_id, dbi) + wiki_ids = self.get_valid_codes("WikiGene", species_id, dbi) + + xrefs = [] + + # Read file + for section in self.get_file_sections(xref_file, "//\n"): + entry = "".join(section) + xref = {} + + # Extract the species name and check species ID + species_id_check = self.check_species(entry, name_to_species_id, tax_to_species_id) + + # Skip xrefs for species that don't pass the check + if not species_id_check or species_id != species_id_check: + continue + + # Extract accession + accession = self.ACCESSION_PATTERN.search(entry).group(1) + xref["ACCESSION"] = accession + + # Get the right source ID based on file type and whether this is predicted (X*) or not + source_id = self.get_source_id_for_accession(accession, file_type, source_ids, counts) + # result_message += f"{accession}--{source_id}|" + + if not source_id: + logging.warning(f"Could not get source ID for file type {file_type} for accession {accession}") + continue + + # Extract and fix the version + version = self.VERSION_PATTERN.search(entry).group(1) + acc_no_version, version = version.split(".", 1) if "." in version else (version, None) + if acc_no_version == accession and version is not None: + xref["VERSION"] = version + + # Extract description (may be multi-line) + description = self.extract_description(entry) + + # Extract sequence + parsed_sequence = self.extract_sequence(entry) + + # Extract related pair to current RefSeq accession + # - for rna file, the pair is the protein_id + # - for peptide file, the pair is in DBSOURCE REFSEQ accession or in the coded_by + xref["PAIR"] = self.extract_refseq_pair(file_type, entry) + + # Build the xref fields + xref["LABEL"] = f"{accession}.{version}" + xref["DESCRIPTION"] = description + xref["SOURCE_ID"] = source_id + xref["SEQUENCE"] = parsed_sequence + xref["SEQUENCE_TYPE"] = file_type + xref["SPECIES_ID"] = species_id + xref["INFO_TYPE"] = "SEQUENCE_MATCH" + xref["DEPENDENT_XREFS"] = [] + + # Extract NCBIGene ids + seen_in_record = {} + ncbi_gene_ids = self.GENEID_PATTERN.findall(entry) + for gene_id in ncbi_gene_ids: + if gene_id not in seen_in_record and gene_id in entrez_acc_to_label: + seen_in_record[gene_id] = True + entrez_label = entrez_acc_to_label[gene_id] + + self.add_dependents(xref, gene_id, source_ids, entrez_label, counts) + + if file_type == "peptide" and xref['PAIR']: + if refseq_ids.get(xref['PAIR']): + for refseq_id in refseq_ids[xref['PAIR']]: + for entrez_id in entrez_ids.get(gene_id, []): + self.add_dependent_xref_maponly(entrez_id, source_ids["entrez_source_id"], refseq_id, None, dbi) + for wiki_id in wiki_ids.get(gene_id, []): + self.add_dependent_xref_maponly(wiki_id, source_ids["wiki_source_id"], refseq_id, None, dbi) + + xrefs.append(xref) + + if xrefs: + self.upload_xref_object_graphs(xrefs, dbi) + + result_message = ( + f'Added {counts["num_mrna"]} mRNA xrefs, {counts["num_pred_mrna"]} predicted mRNA xrefs, ' + f'{counts["num_ncrna"]} ncRNA xrefs, {counts["num_pred_ncrna"]} predicted ncRNA xrefs, ' + f'{counts["num_peptide"]} peptide xrefs, and {counts["num_pred_peptide"]} predicted peptide xrefs\n' + f'Added the following dependent xrefs:\n' + 
f'\tEntrezGene\t{counts["num_entrez"]}\n' + f'\tWikiGene\t{counts["num_wiki"]}\n' + ) + + return result_message + + def check_species(self, entry: str, name_to_species_id: Dict[str, int], tax_to_species_id: Dict[int, int]) -> Optional[int]: + species_id_check = None + + match = self.ORGAMISM_PATTERN.search(entry) + if match: + species = match.group(1).lower().strip() + species = re.sub(r"\s*\(.+\)", "", species) + species = re.sub(r"\s+", "_", species) + species_id_check = name_to_species_id.get(species) + + # Try going through the taxon ID if species check didn't work + if not species_id_check: + match = self.TAXON_PATTERN.search(entry) + if match: + taxon_id = int(match.group(1)) + species_id_check = tax_to_species_id.get(taxon_id) + + return species_id_check + + def get_source_id_for_accession(self, accession: str, file_type: str, source_ids: Dict[str, int], counts: Dict[str, int]) -> int: + # Check for dna or peptide patterns + if file_type in self.TYPE_PATTERNS: + for pattern, (count_key, source_id_key) in self.TYPE_PATTERNS[file_type].items(): + if pattern.search(accession): + counts[count_key] += 1 + return source_ids[source_id_key] + + # Default case for peptide + if file_type == "peptide": + counts["num_peptide"] += 1 + return source_ids["peptide_source_id"] + + return 0 + + def extract_description(self, entry: str) -> str: + description = self.DESCRIPTION_PATTERN.search(entry).group(1).strip() + description = self.DESC_REMOVE_BRACES_PATTERN.sub("", description) + description = self.NORMALIZE_WHITESPACE_PATTERN.sub(" ", description) + + return description[:255].strip() + + def extract_sequence(self, entry: str) -> str: + sequence = self.SEQUENCE_PATTERN.search(entry).group(1) + sequence = self.SEQ_REMOVE_NUMBERS_PATTERN.sub("", sequence) + parsed_sequence = "".join(self.NORMALIZE_WHITESPACE_PATTERN.sub("", seq_line) for seq_line in sequence.split("\n") if seq_line) + + return parsed_sequence + + def extract_refseq_pair(self, file_type:str, entry: str) -> Optional[str]: + if file_type == "dna": + protein_ids = self.PROTEIN_IDS_PATTERN.findall(entry) + if protein_ids: + return protein_ids[-1] + elif file_type == "peptide": + coded_by = self.CODED_BY_PATTERN.findall(entry) + if coded_by: + return coded_by[-1] + + match = self.DBSOURCE_PATTERN.search(entry) + if match: + return match.group(1) + + return None + + def add_dependents(self, xref: Dict[str, Any], gene_id: str, source_ids: Dict[str, int], label: str, counts: Dict[str, int]) -> None: + # Add EntrezGene and WikiGene dependent xrefs + for source_key in ["entrez", "wiki"]: + dependent = { + "SOURCE_ID": source_ids[f"{source_key}_source_id"], + "LINKAGE_SOURCE_ID": xref["SOURCE_ID"], + "ACCESSION": gene_id, + "LABEL": label, + } + xref["DEPENDENT_XREFS"].append(dependent) + counts[f"num_{source_key}"] += 1 + + def type_from_file(self, file_name: str) -> Optional[str]: + if re.search("RefSeq_protein", file_name): + return "peptide" + if re.search("rna", file_name): + return "dna" + if re.search("protein", file_name): + return "peptide" + return None diff --git a/src/python/ensembl/production/xrefs/parsers/UCSCParser.py b/src/python/ensembl/production/xrefs/parsers/UCSCParser.py index 5de152912..5759b782c 100644 --- a/src/python/ensembl/production/xrefs/parsers/UCSCParser.py +++ b/src/python/ensembl/production/xrefs/parsers/UCSCParser.py @@ -14,49 +14,73 @@ """Parser module for UCSC source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import csv +import re +from typing import Any, Dict, Tuple +from sqlalchemy.sql 
import insert +from sqlalchemy.engine import Connection +from ensembl.xrefs.xref_update_db_model import CoordinateXref as CoordinateXrefORM + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class UCSCParser(BaseParser): + CHROMOSOME_PATTERN = re.compile(r"\Achr") + EXON_PATTERN = re.compile(r",\Z") + EXON_SPLIT_PATTERN = re.compile(r"\s*,\s*") + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") - count = 0 + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"UCSC file is empty") + file_io.seek(0) + + csv_reader = csv.reader(file_io, delimiter="\t", strict=True) + + count = self.process_lines(csv_reader, source_id, species_id, xref_dbi) + + result_message = f"Loaded a total of {count} UCSC xrefs" + return 0, result_message - file_io = self.get_filehandle(file) - csv_reader = csv.reader(file_io, delimiter="\t", strict=True) + def process_lines(self, csv_reader: csv.reader, source_id: int, species_id: int, xref_dbi: Connection) -> int: + count = 0 # Read lines for line in csv_reader: - chromosome = line[1] - strand = line[2] - tx_start = int(line[3]) - tx_end = int(line[4]) - cds_start = int(line[5]) - cds_end = int(line[6]) - exon_starts = line[8] - exon_ends = line[9] - accession = line[11] + try: + chromosome = line[1].strip() + strand = line[2].strip() + exon_starts = line[8].strip() + exon_ends = line[9].strip() + accession = line[11].strip() + + tx_start = int(line[3]) if line[3].strip() else None + tx_end = int(line[4]) if line[4].strip() else None + cds_start = int(line[5]) if line[5].strip() else None + cds_end = int(line[6]) if line[6].strip() else None + + # Check for required keys + if not accession or not chromosome or not strand or tx_start is None or tx_end is None or not exon_starts or not exon_ends: + raise ValueError("Missing required key for xref") + except (IndexError, ValueError) as e: + raise ValueError(f"Error processing line {line}: {e}") # UCSC uses slightly different chromosome names, at least for # human and mouse, so chop off the 'chr' in the beginning. We do # not yet translate the names of the special chromosomes, e.g. # "chr6_cox_hap1" (UCSC) into "c6_COX" (Ensembl) - chromosome = re.sub(r"\Achr", "", chromosome) + chromosome = self.CHROMOSOME_PATTERN.sub("", chromosome) # They also use '+' and '-' for the strand, instead of -1, 0, or 1 - if strand == "+": - strand = 1 - elif strand == "-": - strand = -1 - else: - strand = 0 + strand = 1 if strand == "+" else -1 if strand == "-" else 0 # ... and non-coding transcripts have cds_start == cds_end. # We would like these to be stored as NULLs @@ -65,8 +89,8 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: cds_end = None # exon_starts and exon_ends usually have trailing commas, remove them - exon_starts = re.sub(r",\Z", "", exon_starts) - exon_ends = re.sub(r",\Z", "", exon_ends) + exon_starts = self.EXON_PATTERN.sub("", exon_starts) + exon_ends = self.EXON_PATTERN.sub("", exon_ends) # ... 
and they use the same kind of "inbetween" coordinates as e.g. # exonerate, so increment all start coordinates by one @@ -80,57 +104,24 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: # element of the resulting array, then join the result into a new # comma-separated list exon_starts = ",".join( - str(int(x) + 1) for x in re.split(r"\s*,\s*", exon_starts) + str(int(x) + 1) for x in self.EXON_SPLIT_PATTERN.split(exon_starts) ) - self.add_xref( - source_id, - species_id, - { - "accession": accession, - "chromosome": chromosome, - "strand": strand, - "txStart": tx_start, - "txEnd": tx_end, - "cdsStart": cds_start, - "cdsEnd": cds_end, - "exonStarts": exon_starts, - "exonEnds": exon_ends, - }, - xref_dbi, + # Add coordinate xref + query = insert(CoordinateXrefORM).values( + source_id=source_id, + species_id=species_id, + accession=accession, + chromosome=chromosome, + strand=strand, + txStart=tx_start, + txEnd=tx_end, + cdsStart=cds_start, + cdsEnd=cds_end, + exonStarts=exon_starts, + exonEnds=exon_ends, ) + xref_dbi.execute(query) count += 1 - file_io.close() - - result_message = f"Loaded a total of {count} UCSC xrefs" - - return 0, result_message - - def add_xref(self, source_id: int, species_id: int, xref: Dict[str, Any], dbi: Connection) -> None: - for required_key in [ - "accession", - "chromosome", - "strand", - "txStart", - "txEnd", - "exonStarts", - "exonEnds", - ]: - if not xref.get(required_key): - raise KeyError(f"Missing required key {required_key} for Xref") - - query = insert(CoordinateXrefORM).values( - source_id=source_id, - species_id=species_id, - accession=xref["accession"], - chromosome=xref["chromosome"], - strand=xref["strand"], - txStart=xref["txStart"], - txEnd=xref["txEnd"], - cdsStart=xref["cdsStart"], - cdsEnd=xref["cdsEnd"], - exonStarts=xref["exonStarts"], - exonEnds=xref["exonEnds"], - ) - dbi.execute(query) + return count \ No newline at end of file diff --git a/src/python/ensembl/production/xrefs/parsers/UniProtParser.py b/src/python/ensembl/production/xrefs/parsers/UniProtParser.py index e99b33cdc..1886c6fc6 100644 --- a/src/python/ensembl/production/xrefs/parsers/UniProtParser.py +++ b/src/python/ensembl/production/xrefs/parsers/UniProtParser.py @@ -14,100 +14,112 @@ """Parser module for Uniprot sources.""" -from ensembl.production.xrefs.parsers.BaseParser import * - +import re +import logging +import csv import codecs +from typing import Dict, Any, Tuple, List +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class UniProtParser(BaseParser): + SWISSPROT_RELEASE_PATTERN = re.compile(r"(UniProtKB/Swiss-Prot Release .*)") + TREMBL_RELEASE_PATTERN = re.compile(r"(UniProtKB/TrEMBL Release .*)") + TAXON_PATTERN = re.compile(r"[a-zA-Z_]+=([0-9 ,]+).*;") + CAUTION_PATTERN = re.compile(r"CAUTION: The sequence shown here is derived from an Ensembl") + SP_TYPE_PATTERN = re.compile(r"(\w+)\s+(\w+)") + PROTEIN_EVIDENCE_PATTERN = re.compile(r"(\d+)") + VERSION_PATTERN = re.compile(r"\d+-\w+-\d+, entry version (\d+)") + REVIEWED_PATTERN = re.compile(r"^Reviewed", re.IGNORECASE) + UNREVIEWED_PATTERN = re.compile(r"Unreviewed", re.IGNORECASE) + DESCRIPTION_PATTERN = re.compile(r"(RecName|SubName): Full=(.*)") + ECO_PATTERN = re.compile(r"\s*\{ECO:.*?\}") + EC_PATTERN = re.compile(r"EC=([^;]+)") + SEQUENCE_PATTERN = re.compile(r"^SEQUENCE") + WHITESPACE_PATTERN = re.compile(r"\s+") + GENE_NAME_PATTERN = re.compile(r"Name=(.*)") + SYNONYMS_PATTERN = re.compile(r"Synonyms=(.*)") + SYNONYMS_COMMA_PATTERN = 
re.compile(r"\s*,\s*") + DEPENDENTS_PATTERN = re.compile(r"^(GO|UniGene|RGD|CCDS|IPI|UCSC|SGD|HGNC|MGI|VGNC|Orphanet|ArrayExpress|GenomeRNAi|EPD|Xenbase|Reactome|MIM|GeneCards)") + STABLE_ID_PATTERN = re.compile(r"\.[0-9]+") + PROTEIN_ID_PATTERN = re.compile(r"([^.]+)\.([^.]+)") + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] - release_file = args["rel_file"] - verbose = args.get("verbose", False) - hgnc_file = args.get("hgnc_file") + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + release_file = args.get("rel_file") + verbose = args.get("verbose", False) + hgnc_file = args.get("hgnc_file") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") # Get needed source ids + source_ids = self.get_source_ids(xref_dbi, verbose) + + # Parse and set release info + self.set_release_info(release_file, source_ids, xref_dbi, verbose) + + result_message = self.create_xrefs(source_ids, species_id, xref_file, xref_dbi, hgnc_file) + return 0, result_message + + def get_source_ids(self, dbi: Connection, verbose: bool) -> Dict[str, int]: + source_names = { + "sp_source_id": ("Uniprot/SWISSPROT", "sequence_mapped"), + "sptr_source_id": ("Uniprot/SPTREMBL", "sequence_mapped"), + "sptr_non_display_source_id": ("Uniprot/SPTREMBL", "protein_evidence_gt_2"), + "sp_direct_source_id": ("Uniprot/SWISSPROT", "direct"), + "sptr_direct_source_id": ("Uniprot/SPTREMBL", "direct"), + "isoform_source_id": ("Uniprot_isoform", None), + } + source_ids = { - "sp_source_id": self.get_source_id_for_source_name( - "Uniprot/SWISSPROT", xref_dbi, "sequence_mapped" - ), - "sptr_source_id": self.get_source_id_for_source_name( - "Uniprot/SPTREMBL", xref_dbi, "sequence_mapped" - ), - "sptr_non_display_source_id": self.get_source_id_for_source_name( - "Uniprot/SPTREMBL", xref_dbi, "protein_evidence_gt_2" - ), - "sp_direct_source_id": self.get_source_id_for_source_name( - "Uniprot/SWISSPROT", xref_dbi, "direct" - ), - "sptr_direct_source_id": self.get_source_id_for_source_name( - "Uniprot/SPTREMBL", xref_dbi, "direct" - ), - "isoform_source_id": self.get_source_id_for_source_name( - "Uniprot_isoform", xref_dbi - ), + key: self.get_source_id_for_source_name(name, dbi, type) + for key, (name, type) in source_names.items() } if verbose: - logging.info(f'SwissProt source ID = {source_ids["sp_source_id"]}') - logging.info(f'SpTREMBL source ID = {source_ids["sptr_source_id"]}') - logging.info( - f'SpTREMBL protein_evidence > 2 source ID = {source_ids["sptr_non_display_source_id"]}' - ) - logging.info( - f'SwissProt direct source ID = {source_ids["sp_direct_source_id"]}' - ) - logging.info( - f'SpTREMBL direct source ID = {source_ids["sptr_direct_source_id"]}' - ) + for key, value in source_ids.items(): + logging.info(f'{key} = {value}') - # Parse and set release info - if release_file: - sp_release = None - sptr_release = None + return source_ids + + def set_release_info(self, release_file: str, source_ids: Dict[str, int], dbi: Connection, verbose: bool) -> None: + if not release_file: + return + + sp_release = None + sptr_release = None - release_io = self.get_filehandle(release_file) + with 
self.get_filehandle(release_file) as release_io: for line in release_io: line = line.strip() if not line: continue - match = re.search(r"(UniProtKB/Swiss-Prot Release .*)", line) + match = self.SWISSPROT_RELEASE_PATTERN.search(line) if match: sp_release = match.group(1) if verbose: logging.info(f"Swiss-Prot release is {sp_release}") else: - match = re.search(r"(UniProtKB/TrEMBL Release .*)", line) + match = self.TREMBL_RELEASE_PATTERN.search(line) if match: sptr_release = match.group(1) if verbose: logging.info(f"SpTrEMBL release is {sptr_release}") - release_io.close() - - # Set releases - self.set_release(source_ids["sp_source_id"], sp_release, xref_dbi) - self.set_release(source_ids["sptr_source_id"], sptr_release, xref_dbi) - self.set_release( - source_ids["sptr_non_display_source_id"], sptr_release, xref_dbi - ) - self.set_release(source_ids["sp_direct_source_id"], sp_release, xref_dbi) - self.set_release( - source_ids["sptr_direct_source_id"], sptr_release, xref_dbi - ) - - result_message = self.create_xrefs(source_ids, species_id, file, xref_dbi, hgnc_file) - - return 0, result_message + # Set releases + self.set_release(source_ids["sp_source_id"], sp_release, dbi) + self.set_release(source_ids["sptr_source_id"], sptr_release, dbi) + self.set_release(source_ids["sptr_non_display_source_id"], sptr_release, dbi) + self.set_release(source_ids["sp_direct_source_id"], sp_release, dbi) + self.set_release(source_ids["sptr_direct_source_id"], sptr_release, dbi) - def create_xrefs(self, source_ids: Dict[str, int], species_id: int, file: str, dbi: Connection, hgnc_file: str = None) -> str: + def create_xrefs(self, source_ids: Dict[str, int], species_id: int, xref_file: str, dbi: Connection, hgnc_file: str = None) -> str: counts = { "num_sp": 0, "num_sptr": 0, @@ -120,7 +132,7 @@ def create_xrefs(self, source_ids: Dict[str, int], species_id: int, file: str, d ensembl_derived_protein_count = 0 count = 0 - # Get sources ids of dependent sources + # Get source ids of dependent sources dependent_sources = self.get_xref_sources(dbi) # Extract descriptions from hgnc @@ -137,71 +149,62 @@ def create_xrefs(self, source_ids: Dict[str, int], species_id: int, file: str, d xrefs = [] # Read file - for section in self.get_file_sections(file, "//\n"): - if len(section) == 1: - continue - - entry = "".join(section) + for section in self.get_file_sections(xref_file, "//\n"): + entry = self.extract_entry_fields(section) xref = {} # Extract the species taxon id - found = 0 - match = re.search(r"OX\s+[a-zA-Z_]+=([0-9 ,]+).*;", entry) + found = False + match = self.TAXON_PATTERN.search(entry["OX"][0]) if match: ox = match.group(1) for taxon_id_from_file in ox.split(", "): - taxon_id_from_file = re.sub(r"\s", "", taxon_id_from_file) - if tax_to_species_id.get(taxon_id_from_file): - found = 1 + taxon_id_from_file = taxon_id_from_file.strip() + if tax_to_species_id.get(int(taxon_id_from_file)): + found = True count += 1 - # If no taxon_id's match, skip to next record + # If no taxon_id match found, skip to next record if not found: continue # Check for CC (caution) lines containing certain text # If sequence is from Ensembl, do not use - ensembl_derived_protein = 0 - if re.search( - r"CAUTION: The sequence shown here is derived from an Ensembl", entry - ): - ensembl_derived_protein = 1 - ensembl_derived_protein_count += 1 + ensembl_derived_protein = False + for comment in entry.get("CC", []): + ensembl_derived_protein = bool(self.CAUTION_PATTERN.search(comment)) + if ensembl_derived_protein: + 
ensembl_derived_protein_count += 1 + break # Extract ^AC lines and build list of accessions - accessions = [] - accessions_only = re.findall(r"\nAC\s+(.+)", entry) - for accessions_line in accessions_only: - for acc in accessions_line.split(";"): - acc = acc.strip() - if acc: - accessions.append(acc) + accessions = [acc.strip() for acc in entry["AC"][0].split(";") if acc.strip()] accession = accessions[0] if accession.lower() == "unreviewed": - logging.warn( - f"WARNING: entries with accession of {accession} not allowed, will be skipped" - ) + logging.warning(f"WARNING: entries with accession of {accession} not allowed, will be skipped") continue + # Starting building xref object xref["ACCESSION"] = accession xref["INFO_TYPE"] = "SEQUENCE_MATCH" - xref["SYNONYMS"] = [] - for i in range(1, len(accessions)): - xref["SYNONYMS"].append(accessions[i]) - - sp_type = re.search(r"ID\s+(\w+)\s+(\w+)", entry).group(2) - protein_evidence_code = re.search(r"PE\s+(\d+)", entry).group(1) - version = re.search(r"DT\s+\d+-\w+-\d+, entry version (\d+)", entry).group( - 1 - ) - - # SwissProt/SPTrEMBL are differentiated by having STANDARD/PRELIMINARY here - if re.search(r"^Reviewed", sp_type, re.IGNORECASE): + xref["SYNONYMS"] = accessions[1:] + + # Extract the type, protein evidence code and version + sp_type = self.SP_TYPE_PATTERN.search(entry["ID"][0]).group(2) + protein_evidence_code = self.PROTEIN_EVIDENCE_PATTERN.search(entry["PE"][0]).group(1) + for dt_line in entry.get("DT", []): + match = self.VERSION_PATTERN.search(dt_line) + if match: + version = match.group(1) + break + + # SwissProt/SPTrEMBL are differentiated by having Reviewed/Unreviewed here + if self.REVIEWED_PATTERN.search(sp_type): xref["SOURCE_ID"] = source_ids["sp_source_id"] counts["num_sp"] += 1 - elif re.search(r"Unreviewed", sp_type, re.IGNORECASE): - # Use normal source only if it is PE levels 1 & 2 + elif self.UNREVIEWED_PATTERN.search(sp_type): + # Use normal source only if PE levels 1 & 2 if protein_evidence_code and int(protein_evidence_code) < 3: xref["SOURCE_ID"] = source_ids["sptr_source_id"] counts["num_sptr"] += 1 @@ -220,194 +223,123 @@ def create_xrefs(self, source_ids: Dict[str, int], species_id: int, file: str, d xref["DEPENDENT_XREFS"] = [] xref["DIRECT_XREFS"] = [] - # Extract ^DE lines only and build cumulative description string - description = "" - description_lines = re.findall(r"\nDE\s+(.+)", entry) - for line in description_lines: - match = re.search(r"RecName: Full=(.*);", line) - if match: - if description: - description += "; " - description += match.group(1) - else: - match = re.search(r"SubName: Full=(.*);", line) - if match: - if description: - description += "; " - description += match.group(1) - - description = re.sub(r"^\s*", "", description) - description = re.sub(r"\s*$", "", description) - description = re.sub(r"\s*\{ECO:.*?\}", "", description) - - # Parse the EC_NUMBER line, only for S.cerevisiae for now - if re.search(r"EC=", line) and species_id == "4932": - # Get the EC Number and make it an xref for S.cer if any - EC = re.search(r"\s*EC=([^;]+);", line).group(1) - - dependent = {} - dependent["LABEL"] = EC - dependent["ACCESSION"] = EC - dependent["SOURCE_NAME"] = "EC_NUMBER" - dependent["SOURCE_ID"] = dependent_sources["EC_NUMBER"] - dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] - xref["DEPENDENT_XREFS"].append(dependent) - dependent_xrefs_counts["EC_NUMBER"] = ( - dependent_xrefs_counts.get("EC_NUMBER", 0) + 1 - ) - + # Extract the description + description, ec_number = 
self.extract_description("".join(entry.get("DE", []))) xref["DESCRIPTION"] = description + # Parse the EC_NUMBER, only for S.cerevisiae for now + if ec_number and species_id == 4932: + dependent = {} + dependent["LABEL"] = ec_number + dependent["ACCESSION"] = ec_number + dependent["SOURCE_NAME"] = "EC_NUMBER" + dependent["SOURCE_ID"] = dependent_sources["EC_NUMBER"] + dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] + xref["DEPENDENT_XREFS"].append(dependent) + dependent_xrefs_counts["EC_NUMBER"] = (dependent_xrefs_counts.get("EC_NUMBER", 0) + 1) + # Extract sequence - sequence = re.search(r"SQ\s+(.+)", entry, flags=re.DOTALL).group(1) - sequence = re.sub(r"\n", "", sequence) - sequence = re.sub(r"\/\/", "", sequence) - sequence = re.sub(r"\s", "", sequence) - sequence = re.sub(r"^.*;", "", sequence) + sequence = "" + for seq_line in entry.get("SQ", []): + if not self.SEQUENCE_PATTERN.search(seq_line): + sequence += seq_line + sequence = self.WHITESPACE_PATTERN.sub("", sequence) xref["SEQUENCE"] = sequence # Extract gene names - gene_names = re.findall(r"\nGN\s+(.+)", entry) - gene_names = " ".join(gene_names).split(";") - - # Do not allow the addition of UniProt Gene Name dependent Xrefs - # if the protein was imported from Ensembl. Otherwise we will - # re-import previously set symbols - if not ensembl_derived_protein: - dependent = {} - name_found = 0 - gene_name = None - dep_synonyms = [] - for line in gene_names: - line = line.strip() - - if not re.search(r"Name=", line) and not re.search( - r"Synonyms=", line - ): - continue - - match = re.search(r"Name=([A-Za-z0-9_\-\.\s]+)", line) - if match and not name_found: - gene_name = match.group(1).rstrip() - gene_name = re.sub(r"\nGN", "", gene_name) - name_found = 1 - - match = re.search(r"Synonyms=(.*)", line) - if match: - synonym = match.group(1) - synonym = re.sub(r"\{.*?\}", "", synonym) - synonym = re.sub(r"\s+$", "", synonym) - synonym = re.sub(r"\s*,\s*", ",", synonym) - synonyms = synonym.split(",") - for synonym in synonyms: - if synonym not in dep_synonyms: - dep_synonyms.append(synonym) + if not ensembl_derived_protein and entry.get("GN"): + gene_name, gene_synonyms = self.extract_gene_name(" ".join(entry["GN"])) + # Add dependent xref for gene name if gene_name: + dependent = {} dependent["LABEL"] = gene_name dependent["ACCESSION"] = xref["ACCESSION"] dependent["SOURCE_NAME"] = "Uniprot_gn" dependent["SOURCE_ID"] = dependent_sources["Uniprot_gn"] dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] - dependent["SYNONYMS"] = dep_synonyms - if hgnc_file and hgnc_descriptions.get(gene_name) is not None: + dependent["SYNONYMS"] = gene_synonyms + if hgnc_file and hgnc_descriptions.get(gene_name): dependent["DESCRIPTION"] = hgnc_descriptions[gene_name] xref["DEPENDENT_XREFS"].append(dependent) - dependent_xrefs_counts["Uniprot_gn"] = ( - dependent_xrefs_counts.get("Uniprot_gn", 0) + 1 - ) + dependent_xrefs_counts["Uniprot_gn"] = dependent_xrefs_counts.get("Uniprot_gn", 0) + 1 # Dependent xrefs - only store those that are from sources listed in the source table - deps = re.findall(r"\n(DR\s+.+)", entry) - seen = {} - for dep in deps: - match = re.search(r"^DR\s+(.+)", dep) - if match: - vals = re.split(r";\s*", match.group(1)) - source = vals[0] - acc = vals[1] - extra = [] - if len(vals) > 2: - extra = vals[2 : len(vals)] - - # Skip external sources obtained through other files - if re.search( - r"^(GO|UniGene|RGD|CCDS|IPI|UCSC|SGD|HGNC|MGI|VGNC|Orphanet|ArrayExpress|GenomeRNAi|EPD|Xenbase|Reactome|MIM|GeneCards)", - source, - ): - 
continue
-
-                # If mapped to Ensembl, add as direct xref
-                if source == "Ensembl":
-                    direct = {}
-                    isoform = {}
-
-                    stable_id = extra[0]
-                    stable_id = re.sub(r"\.[0-9]+", "", stable_id)
-                    direct["STABLE_ID"] = stable_id
-                    direct["ENSEMBL_TYPE"] = "Translation"
-                    direct["LINKAGE_TYPE"] = "DIRECT"
-                    if xref["SOURCE_ID"] == source_ids["sp_source_id"]:
-                        direct["SOURCE_ID"] = source_ids["sp_direct_source_id"]
-                        counts["num_direct_sp"] += 1
-                    else:
-                        direct["SOURCE_ID"] = source_ids["sptr_direct_source_id"]
-                        counts["num_direct_sptr"] += 1
-                    xref["DIRECT_XREFS"].append(direct)
-
-                    match = re.search(r"(%s-[0-9]+)" % accession, extra[1])
-                    if match:
-                        isoform = match.group(1)
-                        self.add_to_direct_xrefs(
-                            {
-                                "stable_id": stable_id,
-                                "ensembl_type": "translation",
-                                "accession": isoform,
-                                "label": isoform,
-                                "source_id": source_ids["isoform_source_id"],
-                                "linkage": "DIRECT",
-                                "species_id": species_id,
-                            },
-                            dbi,
-                        )
-                        counts["num_isoform"] += 1
-
-                # Create dependent xref structure & store it
-                if dependent_sources.get(source):
-                    dependent = {}
-
-                    dependent["SOURCE_NAME"] = source
-                    dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"]
-                    dependent["SOURCE_ID"] = dependent_sources[source]
-                    dependent["ACCESSION"] = acc
-
-                    if not seen.get(f"{source}:{acc}"):
+            for dependent_line in entry.get("DR", []):
+                vals = re.split(r";\s*", dependent_line)
+                source = vals[0]
+                dependent_acc = vals[1]
+                extra = vals[2:] if len(vals) > 2 else []
+
+                # Skip external sources obtained through other files
+                if self.DEPENDENTS_PATTERN.search(source):
+                    continue
+
+                # If mapped to Ensembl, add as direct xref
+                if source == "Ensembl":
+                    stable_id = self.STABLE_ID_PATTERN.sub("", extra[0])
+
+                    direct = {}
+                    direct["STABLE_ID"] = stable_id
+                    direct["ENSEMBL_TYPE"] = "Translation"
+                    direct["LINKAGE_TYPE"] = "DIRECT"
+                    if xref["SOURCE_ID"] == source_ids["sp_source_id"]:
+                        direct["SOURCE_ID"] = source_ids["sp_direct_source_id"]
+                        counts["num_direct_sp"] += 1
+                    else:
+                        direct["SOURCE_ID"] = source_ids["sptr_direct_source_id"]
+                        counts["num_direct_sptr"] += 1
+                    xref["DIRECT_XREFS"].append(direct)
+
+                    match = re.search(r"(%s-[0-9]+)" % accession, extra[1])
+                    if match:
+                        isoform = match.group(1)
+
+                        xref_id = self.add_xref(
+                            {
+                                "accession": isoform,
+                                "label": isoform,
+                                "source_id": source_ids["isoform_source_id"],
+                                "species_id": species_id,
+                                "info_type": "DIRECT",
+                            },
+                            dbi,
+                        )
+                        self.add_direct_xref(xref_id, stable_id, "translation", "DIRECT", dbi)
+                        counts["num_isoform"] += 1
+
+                # Create dependent xref structure & store it
+                if dependent_sources.get(source):
+                    # Only add dependent accession once per record
+                    if not seen.get(f"{source}:{dependent_acc}"):
+                        dependent = {
+                            "SOURCE_NAME": source,
+                            "LINKAGE_SOURCE_ID": xref["SOURCE_ID"],
+                            "SOURCE_ID": dependent_sources[source],
+                            "ACCESSION": dependent_acc,
+                        }
+
+                        xref["DEPENDENT_XREFS"].append(dependent)
+                        dependent_xrefs_counts[source] = dependent_xrefs_counts.get(source, 0) + 1
+                        seen[f"{source}:{dependent_acc}"] = True
+
+                # For EMBL source, add protein_id as dependent xref
+                if source == "EMBL":
+                    protein_id = extra[0]
+                    if protein_id != "-" and not seen.get(f"{source}:{protein_id}"):
+                        protein_id_acc = self.PROTEIN_ID_PATTERN.search(protein_id).group(1)
+                        dependent = {
+                            "SOURCE_NAME": source,
+                            "SOURCE_ID": dependent_sources["protein_id"],
+                            "LINKAGE_SOURCE_ID": xref["SOURCE_ID"],
+                            "LABEL": protein_id,
+                            "ACCESSION": protein_id_acc,
+                        }
+                        xref["DEPENDENT_XREFS"].append(dependent)
-                            dependent_xrefs_counts[source] = (
-                                dependent_xrefs_counts.get(source, 0)
+ 1 - ) - seen[f"{source}:{acc}"] = 1 - - if re.search(r"EMBL", dep) and not re.search(r"ChEMBL", dep): - protein_id = extra[0] - if protein_id != "-" and not seen.get( - f"{source}:{protein_id}" - ): - dependent = {} - - dependent["SOURCE_NAME"] = source - dependent["SOURCE_ID"] = dependent_sources["protein_id"] - dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] - dependent["LABEL"] = protein_id - dependent["ACCESSION"] = re.search( - r"([^.]+)\.([^.]+)", protein_id - ).group(1) - xref["DEPENDENT_XREFS"].append(dependent) - dependent_xrefs_counts[source] = ( - dependent_xrefs_counts.get(source, 0) + 1 - ) - seen[f"{source}:{protein_id}"] = 1 + dependent_xrefs_counts["protein_id"] = dependent_xrefs_counts.get("protein_id", 0) + 1 + seen[f"{source}:{protein_id}"] = True xrefs.append(xref) @@ -416,26 +348,101 @@ def create_xrefs(self, source_ids: Dict[str, int], species_id: int, file: str, d count = 0 xrefs.clear() - if len(xrefs) > 0: + if xrefs: self.upload_xref_object_graphs(xrefs, dbi) - result_message = f'Read {counts["num_sp"]} SwissProt xrefs, {counts["num_sptr"]} SPTrEMBL xrefs with protein evidence codes 1-2, and {counts["num_sptr_non_display"]} SPTrEMBL xrefs with protein evidence codes > 2 from {file}\n' - result_message += f'Added {counts["num_direct_sp"]} direct SwissProt xrefs and {counts["num_direct_sptr"]} direct SPTrEMBL xrefs\n' - result_message += f'Added {counts["num_isoform"]} direct isoform xrefs\n' - result_message += f"Skipped {ensembl_derived_protein_count} ensembl annotations as Gene names\n" - - result_message += f"Added the following dependent xrefs:\n" + result_message = ( + f'Read {counts["num_sp"]} SwissProt xrefs, {counts["num_sptr"]} SPTrEMBL xrefs with protein evidence codes 1-2, ' + f'and {counts["num_sptr_non_display"]} SPTrEMBL xrefs with protein evidence codes > 2 from {xref_file}\n' + f'Added {counts["num_direct_sp"]} direct SwissProt xrefs and {counts["num_direct_sptr"]} direct SPTrEMBL xrefs\n' + f'Added {counts["num_isoform"]} direct isoform xrefs\n' + f'Skipped {ensembl_derived_protein_count} ensembl annotations as Gene names\n' + f'Added the following dependent xrefs:\n' + ) for xref_source, xref_count in dependent_xrefs_counts.items(): result_message += f"\t{xref_source}\t{xref_count}\n" return result_message + def extract_entry_fields(self, section: str) -> Dict[str, List[str]]: + entry_dict = {} + in_sq_section = False + + for line in section: + line = line.strip() + if not line: + continue + + line_key = line[:2] + clean_line = line[2:].strip() + + if line_key == "SQ": + in_sq_section = True + elif in_sq_section: + line_key = "SQ" + clean_line = line + + entry_dict.setdefault(line_key, []).append(clean_line) + + return entry_dict + + def extract_description(self, full_description: str) -> Tuple[str, str]: + descriptions = [] + ec_number = None + description = "" + + description_lines = full_description.split(";") + for line in description_lines: + if not line.strip(): + continue + + match = self.DESCRIPTION_PATTERN.search(line) + if match: + descriptions.append(match.group(2)) + + # Get the EC number, if present + match = self.EC_PATTERN.search(line) + if match: + ec_number = match.group(1) + ec_number = self.ECO_PATTERN.sub("", ec_number).strip() + + if descriptions: + description = "; ".join(descriptions) + description = self.ECO_PATTERN.sub("", description).strip() + + return description, ec_number + + def extract_gene_name(self, full_gene_names: str) -> Tuple[str, List[str]]: + name_found = False + gene_name = None + synonyms_list = [] + 
+ gene_name_lines = full_gene_names.split(";") + for line in gene_name_lines: + if not line.strip(): + continue + + match = self.GENE_NAME_PATTERN.search(line) + if match and not name_found: + gene_name = match.group(1) + gene_name = self.ECO_PATTERN.sub("", gene_name).strip() + name_found = True + + match = self.SYNONYMS_PATTERN.search(line) + if match: + synonyms = match.group(1) + synonyms = self.ECO_PATTERN.sub("", synonyms).strip() + synonyms = self.SYNONYMS_COMMA_PATTERN.sub(",", synonyms) + synonyms_list = synonyms.split(",") + + return gene_name, synonyms_list + def get_hgnc_descriptions(self, hgnc_file: str) -> Dict[str, str]: descriptions = {} # Make sure the file is utf8 hgnc_file = codecs.encode(hgnc_file, "utf-8").decode("utf-8") - hgnc_file = re.sub(r'"', '', hgnc_file) + hgnc_file = re.sub(r'"', "", hgnc_file) hgnc_io = self.get_filehandle(hgnc_file) csv_reader = csv.DictReader(hgnc_io, delimiter="\t") @@ -449,4 +456,4 @@ def get_hgnc_descriptions(self, hgnc_file: str) -> Dict[str, str]: hgnc_io.close() - return descriptions \ No newline at end of file + return descriptions diff --git a/src/python/ensembl/production/xrefs/parsers/VGNCParser.py b/src/python/ensembl/production/xrefs/parsers/VGNCParser.py index 21cb13d58..ffab8f1c2 100644 --- a/src/python/ensembl/production/xrefs/parsers/VGNCParser.py +++ b/src/python/ensembl/production/xrefs/parsers/VGNCParser.py @@ -14,18 +14,51 @@ """Parser module for VGNC source (uses HGNC Parser as parent).""" -from ensembl.production.xrefs.parsers.HGNCParser import * +import csv +from typing import Dict, Any, Tuple +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.HGNCParser import HGNCParser class VGNCParser(HGNCParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") + + # Open the VGNC file + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"VGNC file is empty") + file_io.seek(0) + + csv_reader = csv.DictReader(file_io, delimiter="\t") + + # Check if header has required columns + required_columns = [ + "taxon_id", + "ensembl_gene_id", + "vgnc_id", + "symbol", + "name", + "alias_symbol", + "prev_symbol", + ] + if not set(required_columns).issubset(set(csv_reader.fieldnames)): + raise ValueError(f"Can't find required columns in VGNC file '{xref_file}'") + + count, syn_count = self.process_lines(csv_reader, source_id, species_id, xref_dbi) + + result_message = f"Loaded a total of {count} VGNC xrefs and added {syn_count} synonyms" - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + return 0, result_message + + def process_lines(self, csv_reader: csv.DictReader, source_id: int, species_id: int, xref_dbi: Connection) -> Tuple[int, int]: + count, syn_count = 0, 0 # Create a hash of all valid taxon_ids for this species species_id_to_tax = self.species_id_to_taxonomy(xref_dbi) @@ -34,46 +67,30 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: tax_ids = species_id_to_tax[species_id] tax_to_species_id = {tax_id: species_id for tax_id in tax_ids} - # Open the vgnc file - file_io = 
self.get_filehandle(file) - csv_reader = csv.DictReader(file_io, delimiter="\t") - - # Check if header has required columns - required_columns = [ - "taxon_id", - "ensembl_gene_id", - "vgnc_id", - "symbol", - "name", - "alias_symbol", - "prev_symbol", - ] - if not set(required_columns).issubset(set(csv_reader.fieldnames)): - raise IOError(f"Can't find required columns in VGNC file '{file}'") - # Read lines - count = 0 for line in csv_reader: + tax_id = int(line["taxon_id"]) # Skip data for other species - if not tax_to_species_id.get(line["taxon_id"]): + if not tax_to_species_id.get(tax_id): continue - # Add ensembl direct xref + # Add Ensembl direct xref if line["ensembl_gene_id"]: - self.add_to_direct_xrefs( + xref_id = self.add_xref( { - "stable_id": line["ensembl_gene_id"], - "ensembl_type": "gene", "accession": line["vgnc_id"], "label": line["symbol"], "description": line["name"], "source_id": source_id, "species_id": species_id, + "info_type": "DIRECT", }, xref_dbi, ) - - self.add_synonyms_for_hgnc( + self.add_direct_xref(xref_id, line["ensembl_gene_id"], "gene", "", xref_dbi) + + # Add synonyms + syn_count += self.add_synonyms_for_hgnc( { "source_id": source_id, "name": line["vgnc_id"], @@ -85,9 +102,5 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: ) count += 1 - - file_io.close() - - result_message = f"Loaded a total of {count} VGNC xrefs" - - return 0, result_message + + return count, syn_count \ No newline at end of file diff --git a/src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py b/src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py index 38c8ccbda..91d37397b 100644 --- a/src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py +++ b/src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py @@ -14,63 +14,67 @@ """Parser module for Xenbase source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import csv +import re +from typing import Any, Dict, Tuple +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class XenopusJamboreeParser(BaseParser): + DESC_PROVENANCE_PATTERN = re.compile(r"\s*\[.*\]", re.IGNORECASE | re.DOTALL) + DESC_LABEL_PATTERN = re.compile(r",\s+\d+\s+of\s+\d+", re.IGNORECASE | re.DOTALL) + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") count = 0 - file_io = self.get_filehandle(file) - csv_reader = csv.reader(file_io, delimiter="\t") - - # Read lines - for line in csv_reader: - accession = line[0] - label = line[1] - desc = line[2] - stable_id = line[3] - - # If there is a description, trim it a bit - if desc: - desc = self.parse_description(desc) - - if label == "unnamed": - label = accession - - self.add_to_direct_xrefs( - { - "stable_id": stable_id, - "ensembl_type": "gene", - "accession": accession, - "label": label, - "description": desc, - "source_id": source_id, - "species_id": species_id, - }, - xref_dbi, - ) - count += 1 - - file_io.close() - - result_message = f"{count} XenopusJamboreeParser xrefs succesfully parsed" - + with 
self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"XenopusJamboree file is empty") + file_io.seek(0) + + csv_reader = csv.reader(file_io, delimiter="\t") + + # Read lines + for line in csv_reader: + accession, label, desc, stable_id = line[:4] + + # If there is a description, trim it a bit + if desc: + desc = self.parse_description(desc) + + if label == "unnamed": + label = accession + + xref_id = self.add_xref( + { + "accession": accession, + "label": label, + "description": desc, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + self.add_direct_xref(xref_id, stable_id, "gene", "", xref_dbi) + count += 1 + + result_message = f"{count} XenopusJamboree xrefs successfully parsed" return 0, result_message def parse_description(self, description: str) -> str: # Remove some provenance information encoded in the description - description = re.sub(r"\s*\[.*\]", "", description) + description = self.DESC_PROVENANCE_PATTERN.sub("", description) # Remove labels of type 5 of 14 from the description - description = re.sub(r",\s+\d+\s+of\s+\d+", "", description) + description = self.DESC_LABEL_PATTERN.sub("", description) return description diff --git a/src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py b/src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py index 4e703788a..670a03dcc 100644 --- a/src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py +++ b/src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py @@ -14,49 +14,53 @@ """Parser module for ZFIN Descriptions.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import csv +import re +from typing import Any, Dict, Tuple +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class ZFINDescParser(BaseParser): + WITHDRAWN_PATTERN = re.compile(r"^WITHDRAWN:", re.IGNORECASE | re.DOTALL) + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") count = 0 withdrawn = 0 - file_io = self.get_filehandle(file) - csv_reader = csv.DictReader(file_io, delimiter="\t") - csv_reader.fieldnames = ["zfin", "desc", "label", "extra1", "extra2"] - - # Read lines - for line in csv_reader: - # Skip if WITHDRAWN: this precedes both desc and label - if re.search(r"\A WITHDRAWN:", line["label"]): - withdrawn += 1 - else: - xref_id = self.add_xref( - { - "accession": line["zfin"], - "label": line["label"], - "description": line["desc"], - "source_id": source_id, - "species_id": species_id, - "info_type": "MISC", - }, - xref_dbi, - ) - count += 1 - - file_io.close() - - result_message = ( - f"{count} ZFINDesc xrefs added, {withdrawn} withdrawn entries ignored" - ) + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"ZFINDesc file is empty") + file_io.seek(0) + + csv_reader = csv.DictReader(file_io, delimiter="\t") + csv_reader.fieldnames = ["zfin", "desc", "label", "extra1", "extra2"] + + # Read lines + for line in csv_reader: + # Skip if WITHDRAWN: this precedes both 
desc and label + if self.WITHDRAWN_PATTERN.search(line["label"]): + withdrawn += 1 + else: + self.add_xref( + { + "accession": line["zfin"], + "label": line["label"], + "description": line["desc"], + "source_id": source_id, + "species_id": species_id, + "info_type": "MISC", + }, + xref_dbi, + ) + count += 1 + result_message = f"{count} ZFINDesc xrefs added, {withdrawn} withdrawn entries ignored" return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/ZFINParser.py b/src/python/ensembl/production/xrefs/parsers/ZFINParser.py index 8734d62ca..2792af8ff 100644 --- a/src/python/ensembl/production/xrefs/parsers/ZFINParser.py +++ b/src/python/ensembl/production/xrefs/parsers/ZFINParser.py @@ -14,127 +14,134 @@ """Parser module for ZFIN source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import os +import csv +import re +import unicodedata +from typing import Dict, Any, Tuple +from sqlalchemy import select +from ensembl.xrefs.xref_update_db_model import ( + Source as SourceUORM, + Xref as XrefUORM, +) + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class ZFINParser(BaseParser): + REFSEQ_ACC_PATTERN = re.compile(r"^X[PMR]_") + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") # Get the ZFIN source ids - direct_src_id = self.get_source_id_for_source_name( - "ZFIN_ID", xref_dbi, "direct" - ) - dependent_src_id = self.get_source_id_for_source_name( - "ZFIN_ID", xref_dbi, "uniprot/refseq" - ) - description_src_id = self.get_source_id_for_source_name( - "ZFIN_ID", xref_dbi, "description_only" - ) + direct_src_id = self.get_source_id_for_source_name("ZFIN_ID", xref_dbi, "direct") + dependent_src_id = self.get_source_id_for_source_name("ZFIN_ID", xref_dbi, "uniprot/refseq") + description_src_id = self.get_source_id_for_source_name("ZFIN_ID", xref_dbi, "description_only") # Get the ZFIN descriptions - description = {} - query = select(XrefUORM.accession, XrefUORM.description).where( - XrefUORM.source_id == description_src_id - ) + descriptions = {} + query = select(XrefUORM.accession, XrefUORM.description).where(XrefUORM.source_id == description_src_id) for row in xref_dbi.execute(query).mappings().all(): if row.description: - description[row.accession] = row.description + descriptions[row.accession] = row.description # Get the Uniprot and RefSeq accessions swiss = self.get_valid_codes("uniprot/swissprot", species_id, xref_dbi) refseq = self.get_valid_codes("refseq", species_id, xref_dbi) - file_dir = os.path.dirname(file) + file_dir = os.path.dirname(xref_file) counts = {"direct": 0, "uniprot": 0, "refseq": 0, "synonyms": 0, "mismatch": 0} # Process ZFIN to ensEMBL mappings zfin = {} - zfin_io = self.get_filehandle(os.path.join(file_dir, "ensembl_1_to_1.txt")) - zfin_csv_reader = csv.DictReader(zfin_io, delimiter="\t", strict=True) - zfin_csv_reader.fieldnames = ["zfin", "so", "label", "ensembl_id"] - for line in zfin_csv_reader: - self.add_to_direct_xrefs( - { - "stable_id": line["ensembl_id"], - "ensembl_type": 
"gene", - "accession": line["zfin"], - "label": line["label"], - "description": description.get(line["zfin"]), - "source_id": direct_src_id, - "species_id": species_id, - }, - xref_dbi, - ) - - zfin[line["zfin"]] = 1 - counts["direct"] += 1 - - zfin_io.close() + with self.get_filehandle(os.path.join(file_dir, "ensembl_1_to_1.txt")) as zfin_io: + if zfin_io.read(1) == '': + raise IOError(f"ZFIN Ensembl file is empty") + zfin_io.seek(0) + + zfin_csv_reader = csv.DictReader(zfin_io, delimiter="\t", strict=True) + zfin_csv_reader.fieldnames = ["zfin", "so", "label", "ensembl_id"] + for line in zfin_csv_reader: + xref_id = self.add_xref( + { + "accession": line["zfin"], + "label": line["label"], + "description": descriptions.get(line["zfin"]), + "source_id": direct_src_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + self.add_direct_xref(xref_id, line["ensembl_id"], "gene", "", xref_dbi) - # Process ZFIN to Uniprot mappings - swissprot_io = self.get_filehandle(os.path.join(file_dir, "uniprot.txt")) - swissprot_csv_reader = csv.DictReader(swissprot_io, delimiter="\t", strict=True) - swissprot_csv_reader.fieldnames = ["zfin", "so", "label", "acc"] - for line in swissprot_csv_reader: - if swiss.get(line["acc"]) and not zfin.get(line["zfin"]): - for xref_id in swiss[line["acc"]]: - self.add_dependent_xref( - { - "master_xref_id": xref_id, - "accession": line["zfin"], - "label": line["label"], - "description": description.get(line["zfin"]), - "source_id": dependent_src_id, - "species_id": species_id, - }, - xref_dbi, - ) - counts["uniprot"] += 1 - else: - counts["mismatch"] += 1 + zfin[line["zfin"]] = True + counts["direct"] += 1 - swissprot_io.close() + # Process ZFIN to Uniprot mappings + with self.get_filehandle(os.path.join(file_dir, "uniprot.txt")) as swissprot_io: + if swissprot_io.read(1) == '': + raise IOError(f"ZFIN Uniprot file is empty") + swissprot_io.seek(0) + + swissprot_csv_reader = csv.DictReader(swissprot_io, delimiter="\t", strict=True) + swissprot_csv_reader.fieldnames = ["zfin", "so", "label", "acc"] + for line in swissprot_csv_reader: + if swiss.get(line["acc"]) and not zfin.get(line["zfin"]): + for xref_id in swiss[line["acc"]]: + self.add_dependent_xref( + { + "master_xref_id": xref_id, + "accession": line["zfin"], + "label": line["label"], + "description": descriptions.get(line["zfin"]), + "source_id": dependent_src_id, + "species_id": species_id, + }, + xref_dbi, + ) + counts["uniprot"] += 1 + else: + counts["mismatch"] += 1 # Process ZFIN to RefSeq mappings - refseq_io = self.get_filehandle(os.path.join(file_dir, "refseq.txt")) - refseq_csv_reader = csv.DictReader(refseq_io, delimiter="\t", strict=True) - refseq_csv_reader.fieldnames = ["zfin", "so", "label", "acc"] - for line in refseq_csv_reader: - # Ignore mappings to predicted RefSeq - if ( - re.search(r"^XP_", line["acc"]) - or re.search(r"^XM_", line["acc"]) - or re.search(r"^XR_", line["acc"]) - ): - continue - - if refseq.get(line["acc"]) and not zfin.get(line["zfin"]): - for xref_id in refseq[line["acc"]]: - self.add_dependent_xref( - { - "master_xref_id": xref_id, - "accession": line["zfin"], - "label": line["label"], - "description": description.get(line["zfin"]), - "source_id": source_id, - "species_id": species_id, - }, - xref_dbi, - ) - counts["refseq"] += 1 - else: - counts["mismatch"] += 1 - - refseq_io.close() - - # Get the added ZFINs added + with self.get_filehandle(os.path.join(file_dir, "refseq.txt")) as refseq_io: + if refseq_io.read(1) == '': + raise IOError(f"ZFIN 
Refseq file is empty") + refseq_io.seek(0) + + refseq_csv_reader = csv.DictReader(refseq_io, delimiter="\t", strict=True) + refseq_csv_reader.fieldnames = ["zfin", "so", "label", "acc"] + for line in refseq_csv_reader: + # Ignore mappings to predicted RefSeq + if self.REFSEQ_ACC_PATTERN.search(line["acc"]): + continue + + if refseq.get(line["acc"]) and not zfin.get(line["zfin"]): + for xref_id in refseq[line["acc"]]: + self.add_dependent_xref( + { + "master_xref_id": xref_id, + "accession": line["zfin"], + "label": line["label"], + "description": descriptions.get(line["zfin"]), + "source_id": dependent_src_id, + "species_id": species_id, + }, + xref_dbi, + ) + counts["refseq"] += 1 + else: + counts["mismatch"] += 1 + + # Get the added ZFINs zfin = self.get_valid_codes("zfin", species_id, xref_dbi) sources = [] @@ -143,27 +150,31 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: sources.append(row[0]) # Process the synonyms - aliases_io = self.get_filehandle(os.path.join(file_dir, "aliases.txt")) - aliases_csv_reader = csv.DictReader(aliases_io, delimiter="\t", strict=True) - aliases_csv_reader.fieldnames = ["acc", "cur_name", "cur_symbol", "syn", "so"] - for line in aliases_csv_reader: - if zfin.get(line["acc"]): - synonym = ( - unicodedata.normalize("NFKD", line["syn"]) - .encode("ascii", "namereplace") - .decode("ascii") - ) - self.add_to_syn_for_mult_sources( - line["acc"], sources, synonym, species_id, xref_dbi - ) - counts["synonyms"] += 1 - - aliases_io.close() - - result_message = f"{counts['direct']} direct ZFIN xrefs added and\n" - result_message += f"\t{counts['uniprot']} dependent xrefs from UniProt added\n" - result_message += f"\t{counts['refseq']} dependent xrefs from RefSeq added\n" - result_message += f"\t{counts['mismatch']} dependents ignored\n" - result_message += f"\t{counts['synonyms']} synonyms loaded" + with self.get_filehandle(os.path.join(file_dir, "aliases.txt")) as aliases_io: + if aliases_io.read(1) == '': + raise IOError(f"ZFIN Aliases file is empty") + aliases_io.seek(0) + + aliases_csv_reader = csv.DictReader(aliases_io, delimiter="\t", strict=True) + aliases_csv_reader.fieldnames = ["acc", "cur_name", "cur_symbol", "syn", "so"] + for line in aliases_csv_reader: + if zfin.get(line["acc"]): + synonym = ( + unicodedata.normalize("NFKD", line["syn"]) + .encode("ascii", "namereplace") + .decode("ascii") + ) + self.add_to_syn_for_mult_sources( + line["acc"], sources, synonym, species_id, xref_dbi + ) + counts["synonyms"] += 1 + + result_message = ( + f"{counts['direct']} direct ZFIN xrefs added and\n" + f"\t{counts['uniprot']} dependent xrefs from UniProt added\n" + f"\t{counts['refseq']} dependent xrefs from RefSeq added\n" + f"\t{counts['mismatch']} dependents ignored\n" + f"\t{counts['synonyms']} synonyms loaded" + ) return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py b/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py index dcba51ccb..cc90ea85c 100644 --- a/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py +++ b/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py @@ -14,19 +14,26 @@ """Parser module for miRBase source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import re +from typing import Any, Dict, List, Tuple +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class miRBaseParser(BaseParser): + NAME_PATTERN = re.compile(r"^ID\s+(\S+)\s+", re.MULTILINE) + ACCESSION_PATTERN = re.compile(r"^AC\s+(\S+);\s+", re.MULTILINE) + 
DESCRIPTION_PATTERN = re.compile(r"^DE\s+(.*)", re.MULTILINE)
+    SPECIES_NAME_PATTERN = re.compile(r"(.+?)\s+stem(-|\s)loop")
+
    def run(self, args: Dict[str, Any]) -> Tuple[int, str]:
-        source_id = args["source_id"]
-        species_id = args["species_id"]
-        species_name = args["species_name"]
-        file = args["file"]
-        xref_dbi = args["xref_dbi"]
+        source_id = args.get("source_id")
+        species_id = args.get("species_id")
+        species_name = args.get("species_name")
+        file = args.get("file")
+        xref_dbi = args.get("xref_dbi")
        if not source_id or not species_id or not file:
-            raise AttributeError("Need to pass source_id, species_id and file as pairs")
+            raise AttributeError("Missing required arguments: source_id, species_id, and file")
        # Get the species name(s)
        species_to_names = self.species_id_to_names(xref_dbi)
@@ -44,69 +51,61 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]:
        self.upload_xref_object_graphs(xrefs, xref_dbi)
-        result_message = "Read %d xrefs from %s" % (len(xrefs), file)
-
+        result_message = f"Read {len(xrefs)} xrefs from {file}"
        return 0, result_message
    def create_xrefs(self, source_id: int, file: str, species_id: int, name_to_species_id: Dict[str, int]) -> List[Dict[str, Any]]:
        xrefs = []
-        # Read mirbase file
+        # Read miRBase file
        for section in self.get_file_sections(file, "//\n"):
-            if len(section) == 1:
-                continue
-
            entry = "".join(section)
            if not entry:
                continue
-            xref = {}
-
-            (header, sequence) = re.split(r"\nSQ", entry, 2)
+            header, sequence = re.split(r"\nSQ", entry, 1)
            species = None
            # Extract sequence
            if sequence:
-                seq_lines = sequence.split("\n")
-                seq_lines.pop(0)
-
-                sequence = "".join(seq_lines)
-                sequence = sequence.upper()
-                sequence = re.sub("U", "T", sequence)
-                sequence = re.sub(r"[\d+,\s+]", "", sequence)
+                seq_lines = sequence.split("\n")[1:]  # Remove newlines and drop the information line
+                sequence = "".join(seq_lines).upper()  # Join into a single string and convert to uppercase
+                sequence = re.sub("U", "T", sequence)  # Replace Us with Ts
+                sequence = re.sub(r"[\d+\s,]", "", sequence)  # Remove digits, spaces, and commas
            # Extract name, accession, and description
-            name = re.search(r"^ID\s+(\S+)\s+", header, flags=re.MULTILINE).group(1)
-            accession = re.search(r"^AC\s+(\S+);\s+", header, flags=re.MULTILINE).group(
-                1
-            )
-            description = re.search(
-                r"^DE\s+(.+)\s+stem(-|\s)loop", header, flags=re.MULTILINE
-            ).group(1)
-
-            # Format description and extract species name
-            if description:
-                description_parts = re.split(r"\s+", description)
-                description_parts.pop()
-                species = " ".join(description_parts)
-                species = species.lower()
-                species = re.sub(" ", "_", species)
+            name_match = self.NAME_PATTERN.search(header)
+            accession_match = self.ACCESSION_PATTERN.search(header)
+            description_match = self.DESCRIPTION_PATTERN.search(header)
+
+            if not (name_match and accession_match and description_match):
+                continue
+
+            name = name_match.group(1)
+            accession = accession_match.group(1)
+            description = description_match.group(1)
+
+            # Extract species name from description
+            species_name_match = self.SPECIES_NAME_PATTERN.search(description)
+            species = species_name_match.group(1)
+            species = "_".join(species.split()[:-1]).lower()
            # If no species match, skip to next record
            species_id_check = name_to_species_id.get(species)
            if not species_id_check:
                continue
-            if species_id and species_id == species_id_check:
+            if species_id == species_id_check:
                xref = {
                    "SEQUENCE_TYPE": "dna",
                    "STATUS": "experimental",
                    "SOURCE_ID": source_id,
                    "ACCESSION": accession,
                    "LABEL": name,
-                    "DESCRIPTION":
name, + "DESCRIPTION": description, "SEQUENCE": sequence, "SPECIES_ID": species_id, + "INFO_TYPE": "SEQUENCE_MATCH", } xrefs.append(xref) diff --git a/src/python/test/xrefs/__init__.py b/src/python/test/xrefs/__init__.py new file mode 100644 index 000000000..b82a66b8a --- /dev/null +++ b/src/python/test/xrefs/__init__.py @@ -0,0 +1,15 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Xref tests.""" diff --git a/src/python/test/xrefs/conftest.py b/src/python/test/xrefs/conftest.py new file mode 100644 index 000000000..36b690013 --- /dev/null +++ b/src/python/test/xrefs/conftest.py @@ -0,0 +1,135 @@ +import pytest +import os +import io +import re + +from datetime import datetime +from unittest.mock import MagicMock +from typing import Any, Generator, Callable + +from ensembl.utils.database import UnitTestDB, DBConnection +from ensembl.xrefs.xref_update_db_model import Base +from ensembl.production.xrefs.parsers.BaseParser import BaseParser + +# Fixture to set up a test database +@pytest.fixture(scope="module") +def test_db() -> Generator[None, None, None]: + # Create a unique database name using the current user and timestamp + user = os.environ.get("USER", "testuser") + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + db_name = f"{user}_test_xref_{timestamp}" + mysql_url = f"mysql+pymysql://ensadmin:ensembl@mysql-ens-core-prod-1.ebi.ac.uk:4524/{db_name}" + + # Create all tables defined in the Base metadata + with UnitTestDB(mysql_url, metadata=Base.metadata, name=db_name) as test_db: + yield test_db + +# Fixture to connect to the test database and close connection when done +@pytest.fixture +def mock_xref_dbi(test_db: UnitTestDB) -> Generator[Any, None, None]: + conn = test_db.dbc.connect() + yield conn + conn.close() + +# Common test for missing source_id +@pytest.fixture +def test_no_source_id() -> Callable[[BaseParser, int], None]: + def _test_no_source_id(parser_instance: BaseParser, species_id: int = 9606) -> None: + with pytest.raises( + AttributeError, match=r"Missing required arguments: source_id(,| and) species_id(, and file)?" + ): + parser_instance.run( + { + "species_id": species_id, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + return _test_no_source_id + +# Common test for missing species_id +@pytest.fixture +def test_no_species_id() -> Callable[[BaseParser, int], None]: + def _test_no_species_id(parser_instance: BaseParser, source_id: int = 1) -> None: + with pytest.raises( + AttributeError, match=r"Missing required arguments: source_id(,| and) species_id(, and file)?" 
+ ): + parser_instance.run( + { + "source_id": source_id, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + return _test_no_species_id + +# Common test for missing file +@pytest.fixture +def test_no_file() -> Callable[[BaseParser, int, int], None]: + def _test_no_file(parser_instance: BaseParser, source_id: int = 1, species_id: int = 9606) -> None: + with pytest.raises( + AttributeError, match="Missing required arguments: source_id, species_id, and file" + ): + parser_instance.run( + { + "source_id": source_id, + "species_id": species_id, + "xref_dbi": MagicMock(), + } + ) + return _test_no_file + +# Common test for file not found +@pytest.fixture +def test_file_not_found() -> Callable[[BaseParser, int, int], None]: + def _test_file_not_found(parser_instance: BaseParser, source_id: int = 1, species_id: int = 9606) -> None: + with pytest.raises(FileNotFoundError, match=f"Could not find either"): + parser_instance.run( + { + "source_id": source_id, + "species_id": species_id, + "file": "flatfiles/non_existent_file.txt", + "xref_dbi": MagicMock(), + } + ) + return _test_file_not_found + +# Common test for empty file +@pytest.fixture +def test_empty_file() -> Callable[[BaseParser, str, int, int], None]: + def _test_empty_file(parser_instance: BaseParser, source_name: str, source_id: int = 1, species_id: int = 9606) -> None: + mock_file = io.StringIO("") + parser_instance.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(IOError, match=f"{source_name} file is empty"): + parser_instance.run( + { + "source_id": source_id, + "species_id": species_id, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + return _test_empty_file + +@pytest.fixture +def test_missing_required_source_id() -> Callable[[BaseParser, DBConnection, str, int, int, str], None]: + def _test_missing_required_source_id(parser_instance: BaseParser, mock_dbi: DBConnection, source_name: str, source_id: int = 1, species_id: int = 9606, priority_desc: str = None) -> None: + mock_file = io.StringIO("test file") + parser_instance.get_filehandle = MagicMock(return_value=mock_file) + + if priority_desc is not None: + source_name = f"{source_name} ({priority_desc})" + + with pytest.raises( + KeyError, match=re.escape(f"No source_id for source_name={source_name}") + ): + parser_instance.run( + { + "source_id": source_id, + "species_id": species_id, + "file": "dummy_file.txt", + "xref_dbi": mock_dbi, + } + ) + return _test_missing_required_source_id \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/flatfiles/dbass3.txt b/src/python/test/xrefs/parsers/flatfiles/dbass3.txt new file mode 100644 index 000000000..b03169479 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/dbass3.txt @@ -0,0 +1,9 @@ +Id,GeneSymbol,GeneFullName,EnsemblReference,Phenotype,OmimReference,Mutation,Location,AuthenticAberrantDistance,ReadingFrameChange,NucleotideSequence,InTerminalExon,Comment,MutationCoordinates,AberrantSpliceSiteCoordinates,MaximumEntropyModelAuthentic,MaximumEntropyModelCryptic,FirstOrderMarkovModelAuthentic,FirstOrderMarkovModelCryptic,WeightMatrixModelAuthentic,WeightMatrixModelCryptic,PubMedReference,ReferenceText +1,GNAS1,GNAS complex locus,ENSG00000087460,Hereditary 
osteodystrophy,103580,IVS7-1G>A,Exon,+1,+1,AAGCAGGCTGACTATGTGCCGAGCGATCAGgtgtgcaaaacccctccccaccagaggactctgagccctctttccaaactactccagacctttgctttagattggcaattattactgtttcggttggctttggtgagatccattgacctcaattttgtttca(g>a)G/ACCTGCTTCGCTGCCGTGTCCTGACTTCTGGAATCTTTGAGACCAAGTTCCAGGTGGACAAAGTCAACTTCCAgtaagccaactgt,False,,chr20:58909349,chr20:58909350/58909351,10.35,4.02,9.95,4.55,8.06,4.73,12624854,"Rickard & Wilson. (2003) Analysis of GNAS1 and overlapping transcripts identifies the parental origin of mutations in patients with sporadic Albright hereditary osteodystrophy and reveals a model system in which to observe the effects of splicing mutations on translated and untranslated messenger RNA. Am. J. Hum. Genet., 72, 961-974." +2,LDLR (LDLT),low density lipoprotein receptor,ENSG00000130164,Familial hypercholesterolemia,"143890, 144010",IVS9-30GTGCTGATGdelinsCGGCT,Exon,+54,0,TGGCCAGCAATAGAATCTACTGGTCTGACCTGTCCCAGAGAATGATCTGCAGgtgagcgtcgcccctgcctgcagccttggcccgcaggtgaGatgagggctcctg(gcgctgatg>cggct)cccttctctcctcctgcctcagCACCCAGCTTGACAGAGCCCACGGCGTCTCTTCCTATGACACCGTCATCAGCAG/AGACATCCAGGCCCCCGACGGGCTGGCTGTGGACTGGATCCACAGCAACATCTACTGGACCGACTCTGTCCTGGGCACTGTCTCTGTTGCGGATACCAAG/GGCGTGAAGAGGAAAACGTTATTCAGGGAGAACGGCTCCAAGCCAAGGGCCATCGTGGTGGATCCTGTTCATGGgtgcgtatccacgacgctgagg,False,,chr19:11113504,"chr19:11113588/11113589, chr19:11113688/11113689",6.76,2.79,9.02,0.95,11.67,1.22,8872473,"Webb et al. (1996) Genetic variation at a splicing branch point in intron 9 of the low density lipoprotein (LDL)-receptor gene: a rare mutation that disrupts mRNA splicing in a patient with familial hypercholesterolaemia and a common polymorphism. Hum. Mol. Genet., 5, 1325-1331." +3,LDLR(LDLT),low density lipoprotein receptor,ENSG00000130164,Familial hypercholesterolemia,"143890, 144010",IVS1-1G>C,Exon,+10,+1,tgttcctgatcggatgacatttctggttaattctttagttggcaggaaatagacacaggaaacgtggtcagtttctgattctggcgttgagagaccctttctccttttcctctctctca(g>c)TGGGCGACAG/ATGCGAAAGAAACGAGTTCCAGTGCCAAGACGGGAAATGCATCTCCTACAAGTGGGTCTGCGATGGCAGCGCTGAGTGCCAGGATGGCTCTGATGAGTCCCAGGAGACGTGCTgtgagtcccctt,False,,chr19:11100222,chr19:11100232/11100233,9.99,5.28,9.98,4.24,14.12,4.06,10200052,"Maruyama et al. (1998) A novel point mutation in a splice acceptor site of intron 1 of the human low density lipoprotein receptor gene which causes severe hypercholesterolemia: an unexpected absence of exon skipping. Mutations in brief no. 139. Online. Hum. Mutat., 11, 480-481." +4,LDLR/LDLT,low density lipoprotein receptor,ENSG00000130164,Familial hypercholesterolemia,"143890, 144010",IVS7-1G>C,Exon,+17,+2,ctccgtctctagccattggggaagagcctccccaccaagcctctttctctctcttcca(g>c)ATATCGATGAGTGTCAG/GATCCCGACACCTGCAGCCAGCTCTGCGTGAACCTGGAGGGTGGCTACAAGTGCCAGTGTGAGGAAGGCTTCCAGCTGGACCCCCACACGAAGGCCTGCAAGGCTGTGGgtgagcacgggaa,False,,chr19:11111513,chr19:11111530/11111531,13.90,-3.91,13.40,-5.28,16.53,-1.77,10487495,"Yu et al. (1999) Familial hypercholesterolemia. Acceptor splice site (G-->C) mutation in intron 7 of the LDL-R gene: alternate RNA editing causes exon 8 skipping or a premature stop codon in exon 8. LDL-R(Honduras-1) [LDL-R1061(-1) G-->C]. Atherosclerosis, 146, 125-131." +5,COL1A2,"collagen, type I, alpha 2",ENSG00000164692,Ehlers-Danlos syndrome,"120160, 120150",IVS5-1G>C,Exon,+15,0,ttgccctcttttaaataacaacagaaaaatatttacaagtagaatgagaaaatgaactacatgactagtaactaaaaatattttatatatatatataattttttttttttacttctcta(g>c)AACTTTGCTGCTCAG/TATGATGGAAAAGGAGTTGGACTTGGCCCTGGACCAATGgtatgcttatctgt,False,,chr7:94401566,chr7:94401581/94401582,10.62,2.87,11.17,3.61,14.81,4.66,1556139,"Chiodo et al. 
(1992) A base substitution at the splice acceptor site of intron 5 of the COL1A2 gene activates a cryptic splice site within exon 6 and generates abnormal type I procollagen in a patient with Ehlers-Danlos syndrome type VII. J. Biol. Chem., 267, 6361-6369." +6,COL5A1,"collagen, type V, alpha 1",,Ehlers-Danlos syndrome,"120160, 120150",IVS4-2A>G,Exon,+12,0,cctagcttgagtgtcttttgtgagtggcagcttctagggagaatgtttggctctgaggacaagctcgtcttgtggcttggtctggactttcccctgcttcaaggcatggggctgtgtctcccaggtccccatgcgagtgctctgtgagctgctttttcatgagcgtctcttcttttcc(a>g)gGGTGACATCCAG/CAG/CTGCTCTTTGTCTCGGACCACCGGGCAGCTTATGATTACTGTGAGCACTACAGCCCTGACTGTGACACCGCAGTACCTGACACCCCACAGTCGCAGGACCCCAATCCAGATGAATATgtgagttaactctggc,False,,chr9:134727264,"chr9:134727277/134727278, chr9:134727280/134727281",9.58,1.45,9.51,0.45,12.17,1.95,12145749,"Takahara et al. (2002) Order of intron removal influences multiple splice outcomes, including a two-exon skip, in a COL5A1 acceptor-site mutation that results in abnormal pro-alpha1(V) N-propeptides and Ehlers-Danlos syndrome type I. Am. J. Hum. Genet., 71, 451-465." +7,COL5A1,"collagen, type V, alpha 1",ENSG00000130635,Ehlers-Danlos syndrome,"120160, 120150",IVS4-2A>G,Exon,+12,0,cctagcttgagtgtcttttgtgagtggcagcttctagggagaatgtttggctctgaggacaagctcgtcttgtggcttggtctggactttcccctgcttcaaggcatggggctgtgtctcccaggtccccatgcgagtgctctgtgagctgctttttcatgagcgtctcttcttttcc(a>g)gGGTGACATCCAG/CAG/CTGCTCTTTGTCTCGGACCACCGGGCAGCTTATGATTACTGTGAGCACTACAGCCCTGACTGTGACACCGCAGTACCTGACACCCCACAGTCGCAGGACCCCAATCCAGATGAATATgtgagttaactctggc,False,,chr9:134727264,"chr9:134727277/134727278, chr9:134727280/134727281",9.58,1.45,9.51,0.45,12.17,1.95,12145749,"Takahara et al. (2002) Order of intron removal influences multiple splice outcomes, including a two-exon skip, in a COL5A1 acceptor-site mutation that results in abnormal pro-alpha1(V) N-propeptides and Ehlers-Danlos syndrome type I. Am. J. Hum. Genet., 71, 451-465." + diff --git a/src/python/test/xrefs/parsers/flatfiles/dbass5.txt b/src/python/test/xrefs/parsers/flatfiles/dbass5.txt new file mode 100644 index 000000000..d0ce0f459 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/dbass5.txt @@ -0,0 +1,8 @@ +Id,GeneSymbol,GeneFullName,EnsemblReference,Phenotype,OmimReference,Mutation,Location,AuthenticAberrantDistance,ReadingFrameChange,NucleotideSequence,InTerminalExon,Comment,MutationCoordinates,AberrantSpliceSiteCoordinates,MaximumEntropyModelAuthentic,MaximumEntropyModelCryptic,MaximumDependenceDecompositionModelAuthentic,MaximumDependenceDecompositionModelCryptic,FirstOrderMarkovModelAuthentic,FirstOrderMarkovModelCryptic,WeightMatrixModelAuthentic,WeightMatrixModelCryptic,PubMedReference,ReferenceText +1,GNAS1,GNAS complex locus,ENSG00000087460,Hereditary osteodystrophy,103580,IVS7-1G>A,Exon,+1,+1,AAGCAGGCTGACTATGTGCCGAGCGATCAGgtgtgcaaaacccctccccaccagaggactctgagccctctttccaaactactccagacctttgctttagattggcaattattactgtttcggttggctttggtgagatccattgacctcaattttgtttca(g>a)G/ACCTGCTTCGCTGCCGTGTCCTGACTTCTGGAATCTTTGAGACCAAGTTCCAGGTGGACAAAGTCAACTTCCAgtaagccaactgt,False,,chr20:58909349,chr20:58909350/58909351,10.35,4.02,9.95,4.55,8.06,4.73,12624854,"Rickard & Wilson. (2003) Analysis of GNAS1 and overlapping transcripts identifies the parental origin of mutations in patients with sporadic Albright hereditary osteodystrophy and reveals a model system in which to observe the effects of splicing mutations on translated and untranslated messenger RNA. Am. J. Hum. Genet., 72, 961-974." 
+2,LDLR,low density lipoprotein receptor,ENSG00000130164,Familial hypercholesterolemia,"143890, 144010",IVS9-30GTGCTGATGdelinsCGGCT,Exon,+54,0,TGGCCAGCAATAGAATCTACTGGTCTGACCTGTCCCAGAGAATGATCTGCAGgtgagcgtcgcccctgcctgcagccttggcccgcaggtgaGatgagggctcctg(gcgctgatg>cggct)cccttctctcctcctgcctcagCACCCAGCTTGACAGAGCCCACGGCGTCTCTTCCTATGACACCGTCATCAGCAG/AGACATCCAGGCCCCCGACGGGCTGGCTGTGGACTGGATCCACAGCAACATCTACTGGACCGACTCTGTCCTGGGCACTGTCTCTGTTGCGGATACCAAG/GGCGTGAAGAGGAAAACGTTATTCAGGGAGAACGGCTCCAAGCCAAGGGCCATCGTGGTGGATCCTGTTCATGGgtgcgtatccacgacgctgagg,False,,chr19:11113504,"chr19:11113588/11113589, chr19:11113688/11113689",6.76,2.79,9.02,0.95,11.67,1.22,8872473,"Webb et al. (1996) Genetic variation at a splicing branch point in intron 9 of the low density lipoprotein (LDL)-receptor gene: a rare mutation that disrupts mRNA splicing in a patient with familial hypercholesterolaemia and a common polymorphism. Hum. Mol. Genet., 5, 1325-1331." +3,LDLR,low density lipoprotein receptor,ENSG00000130164,Familial hypercholesterolemia,"143890, 144010",IVS1-1G>C,Exon,+10,+1,tgttcctgatcggatgacatttctggttaattctttagttggcaggaaatagacacaggaaacgtggtcagtttctgattctggcgttgagagaccctttctccttttcctctctctca(g>c)TGGGCGACAG/ATGCGAAAGAAACGAGTTCCAGTGCCAAGACGGGAAATGCATCTCCTACAAGTGGGTCTGCGATGGCAGCGCTGAGTGCCAGGATGGCTCTGATGAGTCCCAGGAGACGTGCTgtgagtcccctt,False,,chr19:11100222,chr19:11100232/11100233,9.99,5.28,9.98,4.24,14.12,4.06,10200052,"Maruyama et al. (1998) A novel point mutation in a splice acceptor site of intron 1 of the human low density lipoprotein receptor gene which causes severe hypercholesterolemia: an unexpected absence of exon skipping. Mutations in brief no. 139. Online. Hum. Mutat., 11, 480-481." +4,LDLR,low density lipoprotein receptor,ENSG00000130164,Familial hypercholesterolemia,"143890, 144010",IVS7-1G>C,Exon,+17,+2,ctccgtctctagccattggggaagagcctccccaccaagcctctttctctctcttcca(g>c)ATATCGATGAGTGTCAG/GATCCCGACACCTGCAGCCAGCTCTGCGTGAACCTGGAGGGTGGCTACAAGTGCCAGTGTGAGGAAGGCTTCCAGCTGGACCCCCACACGAAGGCCTGCAAGGCTGTGGgtgagcacgggaa,False,,chr19:11111513,chr19:11111530/11111531,13.90,-3.91,13.40,-5.28,16.53,-1.77,10487495,"Yu et al. (1999) Familial hypercholesterolemia. Acceptor splice site (G-->C) mutation in intron 7 of the LDL-R gene: alternate RNA editing causes exon 8 skipping or a premature stop codon in exon 8. LDL-R(Honduras-1) [LDL-R1061(-1) G-->C]. Atherosclerosis, 146, 125-131." +5,COL1A2,"collagen, type I, alpha 2",ENSG00000164692,Ehlers-Danlos syndrome,"120160, 120150",IVS5-1G>C,Exon,+15,0,ttgccctcttttaaataacaacagaaaaatatttacaagtagaatgagaaaatgaactacatgactagtaactaaaaatattttatatatatatataattttttttttttacttctcta(g>c)AACTTTGCTGCTCAG/TATGATGGAAAAGGAGTTGGACTTGGCCCTGGACCAATGgtatgcttatctgt,False,,chr7:94401566,chr7:94401581/94401582,10.62,2.87,11.17,3.61,14.81,4.66,1556139,"Chiodo et al. (1992) A base substitution at the splice acceptor site of intron 5 of the COL1A2 gene activates a cryptic splice site within exon 6 and generates abnormal type I procollagen in a patient with Ehlers-Danlos syndrome type VII. J. Biol. Chem., 267, 6361-6369." 
+6,COL5A1,"collagen, type V, alpha 1",ENSG00000130635,Ehlers-Danlos syndrome,"120160, 120150",IVS4-2A>G,Exon,+12,0,cctagcttgagtgtcttttgtgagtggcagcttctagggagaatgtttggctctgaggacaagctcgtcttgtggcttggtctggactttcccctgcttcaaggcatggggctgtgtctcccaggtccccatgcgagtgctctgtgagctgctttttcatgagcgtctcttcttttcc(a>g)gGGTGACATCCAG/CAG/CTGCTCTTTGTCTCGGACCACCGGGCAGCTTATGATTACTGTGAGCACTACAGCCCTGACTGTGACACCGCAGTACCTGACACCCCACAGTCGCAGGACCCCAATCCAGATGAATATgtgagttaactctggc,False,,chr9:134727264,"chr9:134727277/134727278, chr9:134727280/134727281",9.58,1.45,9.51,0.45,12.17,1.95,12145749,"Takahara et al. (2002) Order of intron removal influences multiple splice outcomes, including a two-exon skip, in a COL5A1 acceptor-site mutation that results in abnormal pro-alpha1(V) N-propeptides and Ehlers-Danlos syndrome type I. Am. J. Hum. Genet., 71, 451-465." + diff --git a/src/python/test/xrefs/parsers/flatfiles/entrezgene.txt b/src/python/test/xrefs/parsers/flatfiles/entrezgene.txt new file mode 100644 index 000000000..bc07aa246 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/entrezgene.txt @@ -0,0 +1,13 @@ +#tax_id GeneID Symbol LocusTag Synonyms dbXrefs chromosome map_location description type_of_gene Symbol_from_nomenclature_authority Full_name_from_nomenclature_authority Nomenclature_status Other_designations Modification_date Feature_type +9606 1 A1BG - A1B|ABG|GAB|HYST2477 MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410 19 19q13.43 alpha-1-B glycoprotein protein-coding A1BG alpha-1-B glycoprotein O alpha-1B-glycoprotein|HEL-S-163pA|epididymis secretory sperm binding protein Li 163pA 20181208 - +9606 2 A2M - A2MD|CPAMD5|FWP007|S863-7 MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899 12 12p13.31 alpha-2-macroglobulin protein-coding A2M alpha-2-macroglobulin O alpha-2-macroglobulin|C3 and PZP-like alpha-2-macroglobulin domain-containing protein 5|alpha-2-M 20181208 - +9606 2 A2M - A2MD - 12 - alpha-2-macroglobulin protein-coding - - O - 20181208 - +9606 3 A2MP1 - A2MP HGNC:HGNC:8|Ensembl:ENSG00000256069 12 12p13.31 alpha-2-macroglobulin pseudogene 1 pseudo A2MP1 alpha-2-macroglobulin pseudogene 1 O pregnancy-zone protein pseudogene 20180329 - +9606 9 NAT1 - AAC1|MNAT|NAT-1|NATI MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171428 8 8p22 N-acetyltransferase 1 protein-coding NAT1 N-acetyltransferase 1 O arylamine N-acetyltransferase 1|N-acetyltransferase 1 (arylamine N-acetyltransferase)|N-acetyltransferase type 1|arylamide acetylase 1|monomorphic arylamine N-acetyltransferase 20181207 - +9606 10 NAT2 - AAC2|NAT-2|PNAT MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156006 8 8p22 N-acetyltransferase 2 protein-coding NAT2 N-acetyltransferase 2 O arylamine N-acetyltransferase 2|N-acetyltransferase 2 (arylamine N-acetyltransferase)|N-acetyltransferase type 2|arylamide acetylase 2 20181207 - +9606 11 NATP - AACP|NATP1 HGNC:HGNC:15 8 8p22 N-acetyltransferase pseudogene pseudo NATP N-acetyltransferase pseudogene O arylamide acetylase pseudogene 20180329 - +9606 12 SERPINA3 - AACT|ACT|GIG24|GIG25 MIM:107280|HGNC:HGNC:16|Ensembl:ENSG00000196136 14 14q32.13 serpin family A member 3 protein-coding SERPINA3 serpin family A member 3 O alpha-1-antichymotrypsin|cell growth-inhibiting gene 24/25 protein|growth-inhibiting protein 24|growth-inhibiting protein 25|serine (or cysteine) proteinase inhibitor, clade A, member 3|serpin A3|serpin peptidase inhibitor, clade A (alpha-1 antiproteinase, antitrypsin), member 3 20181208 - +9606 13 AADAC - CES5A1|DAC MIM:600338|HGNC:HGNC:17|Ensembl:ENSG00000114771 3 3q25.1 arylacetamide deacetylase 
protein-coding AADAC arylacetamide deacetylase O arylacetamide deacetylase|arylacetamide deacetylase (esterase) 20181207 - +9606 14 AAMP - - MIM:603488|HGNC:HGNC:18|Ensembl:ENSG00000127837 2 2q35 angio associated migratory cell protein protein-coding AAMP angio associated migratory cell protein O angio-associated migratory cell protein 20181208 - +9606 15 AANAT - DSPS|SNAT MIM:600950|HGNC:HGNC:19|Ensembl:ENSG00000129673 17 17q25.1 aralkylamine N-acetyltransferase protein-coding AANAT aralkylamine N-acetyltransferase O serotonin N-acetyltransferase|arylalkylamine N-acetyltransferase|serotonin acetylase 20181209 - +9313 16 rpsP FUT79_RS10890 FUT79_10890 - - - 30S ribosomal protein S16 protein-coding - - - 30S ribosomal protein S16 20240427 - diff --git a/src/python/test/xrefs/parsers/flatfiles/hgnc.txt b/src/python/test/xrefs/parsers/flatfiles/hgnc.txt new file mode 100644 index 000000000..824587793 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/hgnc.txt @@ -0,0 +1,21 @@ +HGNC ID Approved symbol Approved name Previous symbols Alias symbols NCBI Gene ID Ensembl gene ID RefSeq IDs CCDS IDs Locus specific databases +HGNC:5 A1BG alpha-1-B glycoprotein 1 ENSG00000121410 NM_130786 CCDS12976 +HGNC:37133 A1BG-AS1 A1BG antisense RNA 1 NCRNA00181, A1BGAS, A1BG-AS FLJ23569 503538 ENSG00000268895 NR_015380 +HGNC:24086 A1CF APOBEC1 complementation factor ACF, ASP, ACF64, ACF65, APOBEC1CF 29974 ENSG00000148584 NM_014576 CCDS7241, CCDS7242, CCDS7243, CCDS73133 +HGNC:7 A2M alpha-2-macroglobulin FWP007, S863-7, CPAMD5 2 ENSG00000175899 NM_000014 CCDS44827 +HGNC:27057 A2M-AS1 A2M antisense RNA 1 144571 ENSG00000245105 NR_026971 +HGNC:41022 A2ML1-AS1 A2ML1 antisense RNA 1 +HGNC:8 A2MP1 alpha-2-macroglobulin pseudogene 1 A2MP 3 ENSG00000256069 NG_001067 +HGNC:30005 A3GALT2 alpha 1,3-galactosyltransferase 2 A3GALT2P IGBS3S, IGB3S 127550 ENSG00000184389 NM_001080438 CCDS60080 +HGNC:18149 A4GALT alpha 1,4-galactosyltransferase (P blood group) P1 A14GALT, Gb3S, P(k) 53947 ENSG00000128274 NM_017436 CCDS14041 "Global Variome shared LOVD|https://databases.lovd.nl/shared/genes/ABCB7","LRG_795|https://ftp.ebi.ac.uk/pub/databases/lrgex/pending/LRG_795.xml" +HGNC:17968 A4GNT alpha-1,4-N-acetylglucosaminyltransferase alpha4GnT 51146 ENSG00000118017 NM_016161 CCDS3097 "Global Variome shared LOVD|https://databases.lovd.nl/shared/genes/ACE2" +HGNC:13666 AAAS aladin WD repeat nucleoporin 8086 ENSG00000094914 CCDS8856, CCDS53797 +HGNC:30205 AAMDC adipogenesis associated Mth938 domain containing C11orf67 PTD015, FLJ21035, CK067 28971 ENSG00000087884 NM_024684 CCDS8254, CCDS81604, CCDS81605, CCDS86232 +HGNC:18 AAMP angio associated migratory cell protein 14 ENSG00000127837 NM_001087 CCDS33378, CCDS77530 +HGNC:19 AANAT aralkylamine N-acetyltransferase SNAT 15 ENSG00000129673 NM_001088 CCDS11745, CCDS54169 +HGNC:15886 AAR2 AAR2 splicing factor homolog C20orf4 bA234K24.2 25980 ENSG00000131043 NM_015511 CCDS13273 +HGNC:33842 AARD alanine and arginine rich domain containing protein C8orf85 LOC441376 441376 ENSG00000205002 NM_001025357 CCDS34935 +HGNC:20 AARS alanyl-tRNA synthetase CMT2N, AlaRS 16 ENSG00000090861 NM_001605 CCDS32474 "LRG_359|https://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_359.xml" +HGNC:21022 AARS2 alanyl-tRNA synthetase 2, mitochondrial AARSL KIAA1270, bA444E17.1 57505 ENSG00000124608 NM_020745 CCDS34464 +HGNC:28417 AARSD1 alanyl-tRNA synthetase domain containing 1 MGC2744 80755 ENSG00000266967 NM_001261434 CCDS11447, CCDS45691, CCDS58552 +HGNC:49894 AARSP1 alanyl-tRNA synthetase pseudogene 1 
ENSG00000249038 diff --git a/src/python/test/xrefs/parsers/flatfiles/hpa.txt b/src/python/test/xrefs/parsers/flatfiles/hpa.txt new file mode 100644 index 000000000..5deb80b99 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/hpa.txt @@ -0,0 +1,11 @@ +Antibody,antibody_id,ensembl_peptide_id,link +CAB000001,1,ENSP00000363822,http://www.proteinatlas.org/ENSG00000169083-AR +CAB000001,1,ENSP00000379358,http://www.proteinatlas.org/ENSG00000169083-AR +CAB000001,1,ENSP00000379359,http://www.proteinatlas.org/ENSG00000169083-AR +CAB000001,1,ENSP00000421155,http://www.proteinatlas.org/ENSG00000169083-AR +CAB000001,1,ENSP00000479013,http://www.proteinatlas.org/ENSG00000169083-AR +CAB000001,1,ENSP00000482407,http://www.proteinatlas.org/ENSG00000169083-AR +CAB000001,1,ENSP00000484033,http://www.proteinatlas.org/ENSG00000169083-AR +CAB000002,2,ENSP00000224784,http://www.proteinatlas.org/ENSG00000107796-ACTA2 +CAB000002,2,ENSP00000396730,http://www.proteinatlas.org/ENSG00000107796-ACTA2 +CAB000002,2,ENSP00000398239,http://www.proteinatlas.org/ENSG00000107796-ACTA2 diff --git a/src/python/test/xrefs/parsers/flatfiles/jgi_protein.fasta b/src/python/test/xrefs/parsers/flatfiles/jgi_protein.fasta new file mode 100644 index 000000000..25208108b --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/jgi_protein.fasta @@ -0,0 +1,108 @@ +>ci0100130000 +MPLEENISSSKRKPGSRGGVSFFSYFTQELTHGYFMDQNDARYTERRERVYTFLKQPREIEKVRPFPPFL +CLDVFLYVFTFLPLRVLFALLKLLSAPFCWFQRRSLLDPAQSCDLLKGVIFTSCVFCMSYIDTSIIYHLV +RAQTLIKLYIIYNMLEVADRLFSSFGQDILDALFLTATESNRQKRESFRVLLHLILAVIYVFSHAVLVLF +EATTLNVAFNSHNKVLLTIMMANNFVEIKGTVFKKYDKNNLFQISCSDIRERFHYFALMLVVLLRNMQQY +SWNYEHFTEIIPNMLMLLSSECVVDWFKHAFVLKFNHIPIESYSEYRATLAYDVASSRHKDSINDHSDVV +SRRLGFIPLPLAVLVSYSSALLLPVSDFSVCSSVLVYRIKKRFV*MHFSSLTLLKVFNSIVIVGKACCYI +SDDEAQAANVRVNGARIAVVDPFEQRGNKTILVSQARAQPPEPTVKPPASGDPGLDSKKLLLSPEKNRKL +PKEVTTPARLRSMRAPSVDHTVAAGTNLPSRNDDDVGDVDVLRHQAPDSVRSRKRHTATIVKATAIDEEI +H* +>ci0100130001 +MLPIVDFKQCRPSVEASDKEINETAKLLVDALSTVGFAYLKNCGIKKNCRRSQKHRG*MGGVRYLYYPPI +KGELELNQERLGEHSDYGSITLLFVDDNGGLQIETEGTYKDVPVIEDTILINIGDALEFWTKGKLRSTKH +RVNIPDDEVKRNSIRRSIGYFVFPDDDVVINQPLQFKGDADVPDPVKDPITALKYIQQKLSHTCQNT* +>ci1100130002 +MNWKTWEEMENDLGIYYRPTNRKLDRRKGPIEEGQINFKITIPSTLKRKIKHDVDKNLNEELIENADKQQ +NTEEQSHSMDQIFSSTQIGASVSHNVEDLHSVKRPRLSPIIAKSKPAVHSTSVIINPSDEESDSVFDKTK +SRADVSHKSIPIHADENLAQSSVHLDVENSVLSDKSFDNSKNASNRFDLPATASKPTKSTQQNESEMFLI +SESATLNESYHQVLSKAKHFLGKFKPKKIPLKVNNNQTKTSNTDKPRKIKPPKGFDGFAVVPPINPASSS +AKHRTTSTEVNRISSNLAQWRYTLEQRLLSQSSDS*MVASAIYVLDLKGKVLISRNYRGNIPMNAIDAFP +KLLLEQEEEGTLTPVLMHGDITFVFIRFSNLYMVATTNKNSNVMMISSFMHKLCQIFAHYFKELEEESIK +DNFVIVYELFDEVMDFGYPQFSDPKILQEYITQEGHKLEIQVRPPSTVTNAVSWRSEGLKYRKNEVFLDV +IESVNLLVSSTGNVLRSEIVGSVKMRVYLTGMPELRLGLNDKVLFQNTGRGKSKAVEMEDVKFHQCVRLS +RFENDRTISFIPPDGEFELMSYRLNTHVKPLIWIESVIERHSHSRVEIMVKAKSQFKRRSTANNVEIQIP +VPNDADTPKFKTSVGSVKWVPETSNIVWTVKSFPGGKEYLMRAHFGLPSVESEELEGKPPISVKFEIPYF +TTSGIQVRYLKIIEKSGYQALPWVRYITQNGDYQLRTN* +>ci0100130003 +MPPKKKKEVEKPPLILGRLGTSLKIGIVGLPNVGKSTFFNVLTKSEASAENFPFCTIDPNESRVPVPDER +WEFLCKYHKPASKVPAFLSVVDIAGLVKGANEGQGLGNAFLSHISGCDAIFHMTRAFDDAEVVHVEGDVN +PVRDLEIIQEELRLKDVEHLTKRLAELEKVYSRGGEKKYKLEFETLSKIKTLLVDEKKPVRDGEWGGKEI +EVLNEHLFLTSKPQIYLVNLSEKDYIRKKNKWLMKIKTWVTENDSSAILIPFSGAFELKLAEMADDAERK +AYLEEQYKDSVGSALSKIVVTGFKCLGLQYFFTAGADEVKAWTIKTGFLAPQAAGRIHTDFEKGFIMAEV +MKFSDFKELGSESAVKSAGKYRQQGRNYIVEDGDIIFFKFNTPSQPKKK*MSQLAEMADDAERKAYLEEQ +YKDSVGSALSKIVVTGFKCLGLQYFFTAGADEVKAWTIKTGFLAPQAAGRIHTDFEKGFIMAEVMKFSDF +KELGSESAVKSAGKYRQQGRNYIVEDGDIIFFKFNTPSQPKKK* +>ci0100130004 
+VFKVCLNHFITEAIHFNFKENSDKVLVWAATDFSDPDKPNGEMLQFAMKLKSAETAINFLNTVQDGNEAF +SVKRLLDPVSVLEDKSEINTSVQDASNASQQSENGNTSVHKSPVKSTKPAFSFANVATPFGNKNKPLFSD +IVFGMLMVLFNFFK*MTSQPGTPKTQPAAPTTKFTFDASSISFNFGSTSTPATSAPPFQIAPAVMQKPAA +SKSLFGVVQPSTGNNDASQQKEQNTIIGQHSSKFQFDMATADNDQNGVDTNVGEKETKKVEKGKNSIFLC +SCYMYSYIICSAPWLSGLGACIGTKVFPVQCSPAILTLITYVSLGKTLNGHCSNPAVTNGLSKS*MLFTG +ASQSPDADPEAFNPDYKPVVAELPPLIEMKTGEEEEEILFKERCKMFRFDNSISNWKERGLGELKILFHK +GMNLHRVVMRREQVFKVCANHLITKDMNLLPNSDKSWMYVANNKSDGEAEVEKLSVKFKTPQIANQFKEI +WDTCRHGS* +>ci0100130005 +CQICFETYTRPKSLNCQHTFCLKCLEEYTPPNSVRVICPTCRSEQPLTADGINGLKDNFFISSMSDMLKT +VKEIRSEDKDGTSLMCDTCDHDNRKVAIARCLDCTDFLCNECSTWHIRTKLTRRHKIVSLSEFESGIHNQ +ELKSRAKIYCMIHDGEAAKIYCQSCQCPICHECVESGHSRHQLGKQGIADGEFESTPNLAINSVNEVITA +DYDGAKIQIFDPQGNFKDSFVTEVRGVNKRMCKPAGIAILDNDDIVVCCEDQVHIWTHEGKSVLGFGKGQ +FGNCSSIAVNSENRIVVADVGKHCISVFTDTGKMLLQFGAQGKGESKLVEPRYVACDSQNNIIVSDGGDC +SVKKFSSQGEFLLSFGAEGPERGQFQGPRGLCTDEHDNILVADCWNHRVDIFTPDGCFMRHIATGADSLH +FPWCISLTTNGKLVLSEDYSWSVKIF +>ci0100130006 +MEKDTSLVKVVTEGNGNILKPFTNSNNVELHPMEDEVVVAALEQMSPAEQWELKQQKKREMKMQNELREK +IEAGCKIRQILASPDWKGSIDKLLIAKVKSRDWREVESIFALLKKEEYSKFSCQQACDEQMRGPLRIAVE +NKDIKMLELLLSEDIIKNEKIKVK*MISKARSSPAYILALGDVGKVNKLIARSKFIYEYLLIQADRNFDT +FLYCIKHIYELRQLAKVEHEFSNFYLQLVEDVEKFMCKLLDQYIFQSSPECDINNGLEEIGIGTRVRMLE +KACDYKLVNFVTHHNPQLAIEHLTYRNTPFFRTGNHITFYLTRIMLALMFPVLSIFNIINPKSRAGRLIT +YPCTSYDCRMMSEFLFVVFLVTNISNKKMHLEYLAAPPTTWEVLILIWVMGKFVQEINELNKRGLESYFF +DPWNHLDLWATILFAFNYAFRIVDYVKYHQVPVQQRPPRSEWYMFEWRLVAEGLMACAYVFVFIRLLGLT +RVDRTLGPLQISLARMVKDVVQFLCIFAFILFAFALALTELYWFYGTPKGKEISCDVGVRSNLTNTTASC +PEINTMFHSVWYSMIDLFWSLFGQLDMSKLSLSGKHLFTEYVAKALLAIYHVIAIIVLLNMLIAMMSRSY +ERTSENEEKEWKFQRTKMWIRILRREIIRPPPMNLLPSFKTIWYYLKRLKRLCCFFLVHLIRCRCSTIKR +SFFPGQHRVKYQALNYHKARRNLISKYKTNILLSSENDCT* +>ci0100130007 +MDRIENLLDTGGRYLSPADRQFVSTLLSELEQFQYQAPKDRALMFHPLGGFQRYLIHKVTEVFPKLTSFS +IGDDSNRRTVVCFKSKKKDQQQGLQANGTSKTLPNPKNEYVNAPVRGREEEGRQSAPRSRDSSRSRKKNE +PYDANTSKQPRQPKQLDQAYLPKPLRTKKATNKRDQMKRSRSLQSSPVRMDEEVYHTDDDGRRGRRSKGS +ESGKRAPRSASLKPSSRRRERDVSPEPSEDEFRRQKQHPPLRHNVSDVSIHKRSQGLRMEGKVIKLLSLC +VSQQLWFTNNSVVTYLFTLG*MDEDSSNDYYEDEDSASMASDERPPSYERVKHSSSIKKAERNLNRIKRV +DSQRSKPTSDEDVQIRDSSSSSSGRTVPRGSTHKSPPASTSSHHSKEGKMQRSSTMPRSTKEPAKKQRSS +SKGRSQTLKGPPVSTGRSKSQSTKKHPAPPPPQKEEEKPKRSSSSKQRKKPTSLDEPKKHSSVVTSTDDL +LDASNSTLYHPLPDQEQPSTSKKKSTKEXTPPSDPNAISPVTCDGTAQTEVSGTLSRRKDNRLRESTDTT +PQDSPSHQPSNPDYSKDDVDGFLSGSHASSCSSLNSAPPVPKETKNTEAPPKETTQPPIPTENIPKPQDI +EPATNSVDVEVHENDEATGAEENSESESDAASTKSNVGSIDRSAGDISIYSLDASDGSDKEDEDDKQESD +KERSGESSDSEEATWPAPPVPVNGDGVDKFKSPVSGDEQPSPPQEPATVTQTIDIVTSPTDNADNEEIEQ +HQSAGSVTSSSSEPFAAPQHPEVPLIDVSDSDASSTATEGEKAAENGDLGNGEGERSNSSSDLSTSDENH +KMSDGSYDGDQSRTEESILADKEQPATNGTNQVVEVDGEMVVVQEPQFDYYKWKPDQDVWKSPEYKKFVE +IFNFPASMSDIEVTQHLSKYRGLRLARVDATHALCTLPSDVIAEELADQHFTAFETRPLCDASKQTKAKA +KAKLENEKFEEVRAQRPKSSNAVAKRMIAGALGSSHRSSANKQSTKK* +>ci0100130008 +MKEFLVWSKMNRTTKLKLNGACTVPLSMKGINLKRVKIKRLKCGVWGCTISSNISFVSKHTEIKVLKTTA +KYCDRVNVTNDMGNFTWHEAVTGVTVRYNCTEPQQEVS*MTENLDPGLDLYSLSVLPQKLHVHNCIKYRT +GYTCDTTQMMLERNISRENAWSVCSNAVGNLSGVIVRDSIGFQLLGEVIGNCSEHLQENMKSSNIALAGY +NFNVTDQKYIYCSISEAIEWTELTKYPELELVCQDLLLFNFSILLFFHSFLFFILHTYITITVTLGVFVA +TTFKTIHLLHPGFKSKTAFSLLLFVHTN*MRHLPILHRNGNLFQSSTTTTMSLMEYKAWKNHLQSSQQED +KGKIVAKQVDFLADMTKFKQELKKNIKPDPERHVLVSSIYSVHVNNRNVVNLTKPAVYIFHTQESVSQLN +LDYEHYIAMWNGSGWNRAHHHCIFNHTTRDQQQHNITVIQCDVLATFAIMKV*TLTHPTVYAGSAVLTLT +LLLMLITYAVFRNLLLSRDARHMIINTTLHLLVAVLTFTVGVWSISSKVMCYVTGILLHYSSLSVLLWIT +LSSGNICKEMLAAQQPPLLEPKPSKPMLRFYLIGCGIPIIICGITASAKIENYNGDGGQYCWLSWETSLY 
+AFYAPAACIAIFCILVLLRILATLNCAPSGEMKRKSRKRRKREYSKEFIGEDTPLQYMESSFTQHSNPVN +NSFSKDVENEQSSKTRLQGVALILVLFITTWVTAAMTVAAPRIQEQKISSHVRQTLFNQNYFKFFKQEEP +TPSIDLHLIFSCIFAMMCIAMSSFLLIQHLTSRSDVRRSWRNLCNRRKKKVLEAESNIQINNDIATNIPD +RKRENTVTTAATDPTTTETALHLTTGGESSRGNLTSRHSPFGRNSSAPLPGGTYDTSRASSAHKCAQFHR +EKALSNTLTESGLLVPHSNSSLLLPSDPNNFYTLTEHQHGDSFHSPSQEWGYHGYGQHYYKSSSKPVKMT +NLQQHQLDSSMTEHSFDDSHNNMHTIPVVLQHTQPKIDNKVLYHRYQKMRKALDAKRNRQKKLTVLREYA +QDPLTSNDESPTKPQKSIDKSSEQIPLLSNVSKCHNANEDEMGLDNLVNTDNIITLPMPKPADEGEQNYR +LLLISSPNKKKGETCMQGKYGQRKRVTGDPNLVSSYRRKSLQQNTVPLEPGGSSRSAKRRRPPASRRRAR +QRATRHETQIKAAASHTEATASQVEATAPPVTDNVSVKSRMSANQAAAEARSSGWAMHQDKAHNYLTAQD +LFSGAMHLPSRKSSSCQAPSNTEPQNPGFQIIDMDDNNVGHENATTSPTEDTDLAYAALRNETSV* +>ci0100130009 +MNNIVPSMVKERIRLIKPSRPQAPTEYNLQYKKWPLRPFTLDTFRVVKKQPKKELIEHKDPITQPTHGLC +KMGSFNSDYPCSTASSDGLPTAAVEKRQNKTNDSIVALPPSIDVGEVIDESLSSTLRTQTAKMHERQPKQ +KSPRKPGKSSGKYASKNREGRSGSSFQAVESXMNEDGRQWASDQPPNTEYLQNLPSDNSLDKINPVRSES +IILKPFDQQTNDLPRTPQKSCKHLVSKEGRRSKPFSVDSVGEDSLRQLYSNIPPNDLETLRKTYKGLLGG +FIKVYIWLVLLINTATHLNCLIAVKQNTCVKTTASGV*MKMNMTSSSTFKAHKPKPPEAPIWANKTIVSL +GSGHWNLPMNKGEEWKVQIPEDCKVPVGQKPEYKIRVHNDAKSSDVPMTDLESNEKKTDRTNSDPFIALY +YPDRRTTSSYVSEYMRKFHRFPVQKKRVLV* diff --git a/src/python/test/xrefs/parsers/flatfiles/mgi.txt b/src/python/test/xrefs/parsers/flatfiles/mgi.txt new file mode 100644 index 000000000..9cae91f0e --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/mgi.txt @@ -0,0 +1,10 @@ +MGI:1915733 1110002O04Rik RIKEN cDNA 1110002O04 gene -1.0 1 ENSMUSG00000102531 ENSMUST00000194261 lincRNA gene 35879845 35881119 + lincRNA|ncRNA +MGI:1926146 1500015O10Rik RIKEN cDNA 1500015O10 gene 23.44 1 ENSMUSG00000026051 ENSMUST00000027217 ENSMUSP00000027217 protein coding gene 43730602 43742564 + protein-coding|protein_coding +MGI:1919275 1600012P17Rik RIKEN cDNA 1600012P17 gene 68.49 1 ENSMUSG00000047661 ENSMUST00000062159 ENSMUST00000162474 lincRNA gene 158967701 158980463 - lincRNA +MGI:1914753 1700001G17Rik RIKEN cDNA 1700001G17 gene 12.78 1 ENSMUSG00000103746 lncRNA gene 33669824 33670712 + TEC|ncRNA +MGI:1916606 1700003I22Rik RIKEN cDNA 1700003I22 gene -1.0 1 ENSMUSG00000100372 ENSMUST00000190280 ENSMUST00000186048 lincRNA gene 56018978 56020203 + ncRNA|lincRNA +MGI:1925628 1700006P03Rik RIKEN cDNA 1700006P03 gene -1.0 1 ENSMUSG00000102738 unclassified gene 137325434 137325842 - TEC +MGI:1916558 1700007P06Rik RIKEN cDNA 1700007P06 gene 90.37 1 ENSMUSG00000089730 ENSMUST00000160380 antisense lncRNA gene 187125138 187127852 + ncRNA|antisense +MGI:1923817 1700012E03Rik RIKEN cDNA 1700012E03 gene -1.0 1 ENSMUSG00000101275 ENSMUST00000186237 lincRNA gene 120435805 120438455 + lincRNA +MGI:1916678 1700016C15Rik RIKEN cDNA 1700016C15 gene 82.8 1 ENSMUSG00000015962 ENSMUST00000016106 ENSMUSP00000016106 protein coding gene 177729814 177753324 + protein_coding|protein-coding +MGI:1919458 1700016L21Rik RIKEN cDNA 1700016L21 gene -999.0 1 ENSMUSG00000101483 ENSMUST00000187497 ENSMUST00000189139 antisense lncRNA gene 80445932 80475660 + antisense|ncRNA diff --git a/src/python/test/xrefs/parsers/flatfiles/mgi_desc.txt b/src/python/test/xrefs/parsers/flatfiles/mgi_desc.txt new file mode 100644 index 000000000..f3ffe6388 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/mgi_desc.txt @@ -0,0 +1,11 @@ +MGI Accession ID Chr cM Position genome coordinate start genome coordinate end strand Marker Symbol Status Marker Name Marker Type Feature Type Marker Synonyms (pipe-separated) +MGI:1341858 5 syntenic 03B03F O DNA segment, 03B03F (Research Genetics) BAC/YAC end BAC/YAC end 
+MGI:1341869 5 syntenic 03B03R O DNA segment, 03B03R (Research Genetics) BAC/YAC end BAC/YAC end +MGI:1337005 11 syntenic 03.MMHAP34FRA.seq O DNA segment, 03.MMHAP34FRA.seq DNA Segment DNA segment +MGI:1918911 7 29.36 45567795 45575176 - 0610005C13Rik O RIKEN cDNA 0610005C13 gene Gene antisense lncRNA gene +MGI:1923503 7 syntenic 74818818 74853813 - 0610006L08Rik O RIKEN cDNA 0610006L08 gene Gene lincRNA gene +MGI:1925547 UN N/A 0610008J02Rik O RIKEN cDNA 0610008J02 gene Gene unclassified gene +MGI:1913300 11 31.26 51685386 51688874 - 0610009B22Rik O RIKEN cDNA 0610009B22 gene Gene protein coding gene +MGI:3698435 2 18.90 26445605 26457995 + 0610009E02Rik O RIKEN cDNA 0610009E02 gene Gene unclassified non-coding RNA gene +MGI:1918921 16 syntenic 91947326 91947785 0610009F21Rik O RIKEN cDNA 0610009F21 gene Gene unclassified gene +MGI:1926146 1 23.44 43730602 43742564 + 1500015O10Rik O RIKEN cDNA 1500015O10 gene Gene protein coding gene Ecrg4|augurin diff --git a/src/python/test/xrefs/parsers/flatfiles/mim.txt b/src/python/test/xrefs/parsers/flatfiles/mim.txt new file mode 100644 index 000000000..e44b71186 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/mim.txt @@ -0,0 +1,122 @@ +*RECORD* +*FIELD* NO +100050 +*FIELD* TI +100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT +*FIELD* TX + +DESCRIPTION + +Aarskog syndrome is characterized by short stature and facial, limb, +and genital anomalies. One form of the disorder is X-linked (see +305400), but there is also evidence for autosomal dominant and +autosomal recessive (227330) inheritance (summary by Grier et al., +1983). + +*RECORD* +*FIELD* NO +100070 +*FIELD* TI +%100070 AORTIC ANEURYSM, FAMILIAL ABDOMINAL, 1; AAA1 +;;ANEURYSM, ABDOMINAL AORTIC; AAA;; +ABDOMINAL AORTIC ANEURYSM +*FIELD* TX + +DESCRIPTION + +Abdominal aortic aneurysm is a multifactorial disorder with multiple +genetic and environmental risk factors. The disorder may occur as part +of a heritable syndrome or in isolation (summary by Kuivaniemi et al., +2003). + +*RECORD* +*FIELD* NO +100100 +*FIELD* TI +#100100 PRUNE BELLY SYNDROME; PBS +;;ABDOMINAL MUSCLES, ABSENCE OF, WITH URINARY TRACT ABNORMALITY AND +CRYPTORCHIDISM;; +EAGLE-BARRETT SYNDROME; EGBRS +*FIELD* TX + +A number sign (#) is used with this entry because of evidence that +prune belly syndrome (PBS) is caused by homozygous mutation in the +CHRM3 gene (118494) on chromosome 1q43. One such family has been +reported. + +*RECORD* +*FIELD* NO +100500 +*FIELD* TI +^100500 MOVED TO 100650 +*FIELD* TX + +fnord + +*RECORD* +*FIELD* NO +100640 +*FIELD* TI +*100640 ALDEHYDE DEHYDROGENASE 1 FAMILY, MEMBER A1; ALDH1A1 +;;ALDEHYDE DEHYDROGENASE 1; ALDH1;; +ACETALDEHYDE DEHYDROGENASE 1;; +ALDH, LIVER CYTOSOLIC;; +RETINAL DEHYDROGENASE 1; RALDH1 +*FIELD* TX + +DESCRIPTION + +The ALDH1A1 gene encodes a liver cytosolic isoform of acetaldehyde +dehydrogenase (EC 1.2.1.3), an enzyme involved in the major pathway of +alcohol metabolism after alcohol dehydrogenase (ADH, see 103700). See +also liver mitochondrial ALDH2 (100650), variation in which has been +implicated in different responses to alcohol ingestion. + +*RECORD* +*FIELD* NO +100650 +*FIELD* TI +^100650 MOVED TO 200150 +*FIELD* TX + +This entry was incorporated into 200150 on March 2, 2004. + +*RECORD* +*FIELD* NO +100680 +*FIELD* TI +^100680 MOVED TO 100740 +*FIELD* TX + +This entry was incorporated into entry 100740 on August 4, 2010. 
+ +*RECORD* +*FIELD* NO +100740 +*FIELD* TI +^100740 REMOVED FROM DATABASE +*FIELD* TX + +fnord + +*RECORD* +*FIELD* NO +200150 +*FIELD* TI ++200150 CHOREOACANTHOCYTOSIS; CHAC +;;LEVINE-CRITCHLEY SYNDROME;; +ACANTHOCYTOSIS WITH NEUROLOGIC DISORDER;; +NEUROACANTHOCYTOSIS;; +CHOREA-ACANTHOCYTOSIS +*FIELD* TX + +A number sign (#) is used with this entry because choreoacanthocytosis +can be caused by homozygous or compound heterozygous mutation in the +VPS13A gene (605978), which encodes chorein, on chromosome 9q21. + +DESCRIPTION + +Choreoacanthocytosis (CHAC) is a rare disorder characterized by +progressive neurodegeneration and red cell acanthocytosis, with onset +in the third to fifth decade of life (Rubio et al., 1997). +*THEEND* diff --git a/src/python/test/xrefs/parsers/flatfiles/mim2gene.txt b/src/python/test/xrefs/parsers/flatfiles/mim2gene.txt new file mode 100644 index 000000000..dfbe2a148 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/mim2gene.txt @@ -0,0 +1,10 @@ +#MIM number GeneID type Source MedGenCUI Comment +100050 - phenotype - C3149220 - +100070 - phenotype - C1853365 - +100100 1131 phenotype GeneMap C0033770 question +100200 - phenotype - C4551519 - +100300 57514 phenotype GeneMap C4551482 - +100600 - phenotype - C0000889 - +100640 216 gene - - - +100650 217 gene - - - +100660 218 gene - - - diff --git a/src/python/test/xrefs/parsers/flatfiles/mirbase.txt b/src/python/test/xrefs/parsers/flatfiles/mirbase.txt new file mode 100644 index 000000000..43a5cef15 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/mirbase.txt @@ -0,0 +1,506 @@ +ID cel-let-7 standard; RNA; CEL; 99 BP. +XX +AC MI0000001; +XX +DE Caenorhabditis elegans let-7 stem-loop +XX +RN [1] +RX PUBMED; 11679671. +RA Lau NC, Lim LP, Weinstein EG, Bartel DP; +RT "An abundant class of tiny RNAs with probable regulatory roles in +RT Caenorhabditis elegans"; +RL Science. 294:858-862(2001). +XX +RN [2] +RX PUBMED; 12672692. +RA Lim LP, Lau NC, Weinstein EG, Abdelhakim A, Yekta S, Rhoades MW, Burge CB, +RA Bartel DP; +RT "The microRNAs of Caenorhabditis elegans"; +RL Genes Dev. 17:991-1008(2003). +XX +RN [3] +RX PUBMED; 12747828. +RA Ambros V, Lee RC, Lavanway A, Williams PT, Jewell D; +RT "MicroRNAs and other tiny endogenous RNAs in C. elegans"; +RL Curr Biol. 13:807-818(2003). +XX +RN [4] +RX PUBMED; 12769849. +RA Grad Y, Aach J, Hayes GD, Reinhart BJ, Church GM, Ruvkun G, Kim J; +RT "Computational and experimental identification of C. elegans microRNAs"; +RL Mol Cell. 11:1253-1263(2003). +XX +RN [5] +RX PUBMED; 17174894. +RA Ruby JG, Jan C, Player C, Axtell MJ, Lee W, Nusbaum C, Ge H, Bartel DP; +RT "Large-scale sequencing reveals 21U-RNAs and additional microRNAs and +RT endogenous siRNAs in C. elegans"; +RL Cell. 127:1193-1207(2006). +XX +RN [6] +RX PUBMED; 19460142. +RA Kato M, de Lencastre A, Pincus Z, Slack FJ; +RT "Dynamic expression of small non-coding RNAs, including novel microRNAs +RT and piRNAs/21U-RNAs, during Caenorhabditis elegans development"; +RL Genome Biol. 10:R54(2009). +XX +RN [7] +RX PUBMED; 20062054. +RA Zisoulis DG, Lovci MT, Wilbert ML, Hutt KR, Liang TY, Pasquinelli AE, Yeo +RA GW; +RT "Comprehensive discovery of endogenous Argonaute binding sites in +RT Caenorhabditis elegans"; +RL Nat Struct Mol Biol. 17:173-179(2010). +XX +DR RFAM; RF00027; let-7. +DR WORMBASE; C05G5/12462-12364; . 
+XX +CC let-7 is found on chromosome X in Caenorhabditis elegans [1] and pairs to +CC sites within the 3' untranslated region (UTR) of target mRNAs, specifying +CC the translational repression of these mRNAs and triggering the transition +CC to late-larval and adult stages [2]. +XX +FH Key Location/Qualifiers +FH +FT miRNA 17..38 +FT /accession="MIMAT0000001" +FT /product="cel-let-7-5p" +FT /evidence=experimental +FT /experiment="cloned [1-3], Northern [1], PCR [4], 454 [5], +FT Illumina [6], CLIPseq [7]" +FT miRNA 60..81 +FT /accession="MIMAT0015091" +FT /product="cel-let-7-3p" +FT /evidence=experimental +FT /experiment="CLIPseq [7]" +XX +SQ Sequence 99 BP; 26 A; 19 C; 24 G; 0 T; 30 other; + uacacugugg auccggugag guaguagguu guauaguuug gaauauuacc accggugaac 60 + uaugcaauuu ucuaccuuac cggagacaga acucuucga 99 +// +ID cel-lin-4 standard; RNA; CEL; 94 BP. +XX +AC MI0000002; +XX +DE Caenorhabditis elegans lin-4 stem-loop +XX +RN [1] +RX PUBMED; 11679671. +RA Lau NC, Lim LP, Weinstein EG, Bartel DP; +RT "An abundant class of tiny RNAs with probable regulatory roles in +RT Caenorhabditis elegans"; +RL Science. 294:858-862(2001). +XX +RN [2] +RX PUBMED; 10642801. +RA Olsen PH, Ambros V; +RT "The lin-4 regulatory RNA controls developmental timing in Caenorhabditis +RT elegans by blocking LIN-14 protein synthesis after the initiation of +RT translation"; +RL Dev Biol. 216:671-680(1999). +XX +RN [3] +RX PUBMED; 12672692. +RA Lim LP, Lau NC, Weinstein EG, Abdelhakim A, Yekta S, Rhoades MW, Burge CB, +RA Bartel DP; +RT "The microRNAs of Caenorhabditis elegans"; +RL Genes Dev. 17:991-1008(2003). +XX +RN [4] +RX PUBMED; 12747828. +RA Ambros V, Lee RC, Lavanway A, Williams PT, Jewell D; +RT "MicroRNAs and other tiny endogenous RNAs in C. elegans"; +RL Curr Biol. 13:807-818(2003). +XX +RN [5] +RX PUBMED; 17174894. +RA Ruby JG, Jan C, Player C, Axtell MJ, Lee W, Nusbaum C, Ge H, Bartel DP; +RT "Large-scale sequencing reveals 21U-RNAs and additional microRNAs and +RT endogenous siRNAs in C. elegans"; +RL Cell. 127:1193-1207(2006). +XX +RN [6] +RX PUBMED; 19460142. +RA Kato M, de Lencastre A, Pincus Z, Slack FJ; +RT "Dynamic expression of small non-coding RNAs, including novel microRNAs +RT and piRNAs/21U-RNAs, during Caenorhabditis elegans development"; +RL Genome Biol. 10:R54(2009). +XX +RN [7] +RX PUBMED; 20062054. +RA Zisoulis DG, Lovci MT, Wilbert ML, Hutt KR, Liang TY, Pasquinelli AE, Yeo +RA GW; +RT "Comprehensive discovery of endogenous Argonaute binding sites in +RT Caenorhabditis elegans"; +RL Nat Struct Mol Biol. 17:173-179(2010). +XX +DR RFAM; RF00052; lin-4. +DR WORMBASE; F59G1/6156-6249; . +XX +CC lin-4 is found on chromosome II in Caenorhabditis elegans [1] and is +CC complementary to sequences in the 3' untranslated region (UTR) of lin-14 +CC mRNA. lin-4 acts to developmentally repress the accumulation of lin-14 +CC protein. This repression is essential for the proper timing of numerous +CC events of Caenorhabditis elegans larval development [2]. 
+XX +FH Key Location/Qualifiers +FH +FT miRNA 16..36 +FT /accession="MIMAT0000002" +FT /product="cel-lin-4-5p" +FT /evidence=experimental +FT /experiment="cloned [1,3-4], 454 [5], Illumina [6], +FT CLIPseq [7]" +FT miRNA 55..76 +FT /accession="MIMAT0015092" +FT /product="cel-lin-4-3p" +FT /evidence=experimental +FT /experiment="CLIPseq [7]" +XX +SQ Sequence 94 BP; 17 A; 25 C; 26 G; 0 T; 26 other; + augcuuccgg ccuguucccu gagaccucaa gugugagugu acuauugaug cuucacaccu 60 + gggcucuccg gguaccagga cgguuugagc agau 94 +// +ID cel-mir-1 standard; RNA; CEL; 96 BP. +XX +AC MI0000003; +XX +DE Caenorhabditis elegans miR-1 stem-loop +XX +RN [1] +RX PUBMED; 11679671. +RA Lau NC, Lim LP, Weinstein EG, Bartel DP; +RT "An abundant class of tiny RNAs with probable regulatory roles in +RT Caenorhabditis elegans"; +RL Science. 294:858-862(2001). +XX +RN [2] +RX PUBMED; 11679672. +RA Lee RC, Ambros V; +RT "An extensive class of small RNAs in Caenorhabditis elegans"; +RL Science. 294:862-864(2001). +XX +RN [3] +RX PUBMED; 11679670. +RA Lagos-Quintana M, Rauhut R, Lendeckel W, Tuschl T; +RT "Identification of novel genes coding for small expressed RNAs"; +RL Science. 294:853-858(2001). +XX +RN [4] +RX PUBMED; 12672692. +RA Lim LP, Lau NC, Weinstein EG, Abdelhakim A, Yekta S, Rhoades MW, Burge CB, +RA Bartel DP; +RT "The microRNAs of Caenorhabditis elegans"; +RL Genes Dev. 17:991-1008(2003). +XX +RN [5] +RX PUBMED; 12747828. +RA Ambros V, Lee RC, Lavanway A, Williams PT, Jewell D; +RT "MicroRNAs and other tiny endogenous RNAs in C. elegans"; +RL Curr Biol. 13:807-818(2003). +XX +RN [6] +RX PUBMED; 12769849. +RA Grad Y, Aach J, Hayes GD, Reinhart BJ, Church GM, Ruvkun G, Kim J; +RT "Computational and experimental identification of C. elegans microRNAs"; +RL Mol Cell. 11:1253-1263(2003). +XX +RN [7] +RX PUBMED; 17174894. +RA Ruby JG, Jan C, Player C, Axtell MJ, Lee W, Nusbaum C, Ge H, Bartel DP; +RT "Large-scale sequencing reveals 21U-RNAs and additional microRNAs and +RT endogenous siRNAs in C. elegans"; +RL Cell. 127:1193-1207(2006). +XX +RN [8] +RX PUBMED; 19460142. +RA Kato M, de Lencastre A, Pincus Z, Slack FJ; +RT "Dynamic expression of small non-coding RNAs, including novel microRNAs +RT and piRNAs/21U-RNAs, during Caenorhabditis elegans development"; +RL Genome Biol. 10:R54(2009). +XX +RN [9] +RX PUBMED; 20062054. +RA Zisoulis DG, Lovci MT, Wilbert ML, Hutt KR, Liang TY, Pasquinelli AE, Yeo +RA GW; +RT "Comprehensive discovery of endogenous Argonaute binding sites in +RT Caenorhabditis elegans"; +RL Nat Struct Mol Biol. 17:173-179(2010). +XX +RN [10] +RX PUBMED; 21307183. +RA Warf MB, Johnson WE, Bass BL; +RT "Improved annotation of C. elegans microRNAs by deep sequencing reveals +RT structures associated with processing by Drosha and Dicer"; +RL RNA. 17:563-577(2011). +XX +DR RFAM; RF00103; mir-1. +DR WORMBASE; T09B4/23107-23012; . +XX +CC miR-1 was independently identified in C. elegans [1,2] and Drosophila +CC melanogaster (MIR:MI0000116) [3]. The sequence is also conserved in C. +CC briggsae (MIR:MI0000493). 
+XX +FH Key Location/Qualifiers +FH +FT miRNA 21..42 +FT /accession="MIMAT0020301" +FT /product="cel-miR-1-5p" +FT /evidence=experimental +FT /experiment="Illumina [10]" +FT miRNA 61..81 +FT /accession="MIMAT0000003" +FT /product="cel-miR-1-3p" +FT /evidence=experimental +FT /experiment="cloned [1-2,4-5], Northern [1], Illumina +FT [10,8], PCR [6], 454 [7], CLIPseq [9]" +XX +SQ Sequence 96 BP; 32 A; 16 C; 23 G; 0 T; 25 other; + aaagugaccg uaccgagcug cauacuuccu uacaugccca uacuauauca uaaauggaua 60 + uggaauguaa agaaguaugu agaacggggu gguagu 96 +// +ID cel-mir-2 standard; RNA; CEL; 98 BP. +XX +AC MI0000004; +XX +DE Caenorhabditis elegans miR-2 stem-loop +XX +RN [1] +RX PUBMED; 11679671. +RA Lau NC, Lim LP, Weinstein EG, Bartel DP; +RT "An abundant class of tiny RNAs with probable regulatory roles in +RT Caenorhabditis elegans"; +RL Science. 294:858-862(2001). +XX +RN [2] +RX PUBMED; 11679672. +RA Lee RC, Ambros V; +RT "An extensive class of small RNAs in Caenorhabditis elegans"; +RL Science. 294:862-864(2001). +XX +RN [3] +RX PUBMED; 12672692. +RA Lim LP, Lau NC, Weinstein EG, Abdelhakim A, Yekta S, Rhoades MW, Burge CB, +RA Bartel DP; +RT "The microRNAs of Caenorhabditis elegans"; +RL Genes Dev. 17:991-1008(2003). +XX +RN [4] +RX PUBMED; 12747828. +RA Ambros V, Lee RC, Lavanway A, Williams PT, Jewell D; +RT "MicroRNAs and other tiny endogenous RNAs in C. elegans"; +RL Curr Biol. 13:807-818(2003). +XX +RN [5] +RX PUBMED; 12769849. +RA Grad Y, Aach J, Hayes GD, Reinhart BJ, Church GM, Ruvkun G, Kim J; +RT "Computational and experimental identification of C. elegans microRNAs"; +RL Mol Cell. 11:1253-1263(2003). +XX +RN [6] +RX PUBMED; 17174894. +RA Ruby JG, Jan C, Player C, Axtell MJ, Lee W, Nusbaum C, Ge H, Bartel DP; +RT "Large-scale sequencing reveals 21U-RNAs and additional microRNAs and +RT endogenous siRNAs in C. elegans"; +RL Cell. 127:1193-1207(2006). +XX +RN [7] +RX PUBMED; 19460142. +RA Kato M, de Lencastre A, Pincus Z, Slack FJ; +RT "Dynamic expression of small non-coding RNAs, including novel microRNAs +RT and piRNAs/21U-RNAs, during Caenorhabditis elegans development"; +RL Genome Biol. 10:R54(2009). +XX +RN [8] +RX PUBMED; 20062054. +RA Zisoulis DG, Lovci MT, Wilbert ML, Hutt KR, Liang TY, Pasquinelli AE, Yeo +RA GW; +RT "Comprehensive discovery of endogenous Argonaute binding sites in +RT Caenorhabditis elegans"; +RL Nat Struct Mol Biol. 17:173-179(2010). +XX +RN [9] +RX PUBMED; 21307183. +RA Warf MB, Johnson WE, Bass BL; +RT "Improved annotation of C. elegans microRNAs by deep sequencing reveals +RT structures associated with processing by Drosha and Dicer"; +RL RNA. 17:563-577(2011). +XX +DR RFAM; RF00047; mir-2. +DR WORMBASE; M04C9/29652-29555; . +XX +FH Key Location/Qualifiers +FH +FT miRNA 20..41 +FT /accession="MIMAT0020302" +FT /product="cel-miR-2-5p" +FT /evidence=experimental +FT /experiment="Illumina [9]" +FT miRNA 61..83 +FT /accession="MIMAT0000004" +FT /product="cel-miR-2-3p" +FT /evidence=experimental +FT /experiment="cloned [1-4], PCR [5], 454 [6], Illumina +FT [7,9], CLIPseq [8]" +XX +SQ Sequence 98 BP; 27 A; 19 C; 22 G; 0 T; 30 other; + uaaacaguau acagaaagcc aucaaagcgg ugguugaugu guugcaaauu augacuuuca 60 + uaucacagcc agcuuugaug ugcugccugu ugcacugu 98 +// +ID cel-mir-34 standard; RNA; CEL; 97 BP. +XX +AC MI0000005; +XX +DE Caenorhabditis elegans miR-34 stem-loop +XX +RN [1] +RX PUBMED; 11679671. +RA Lau NC, Lim LP, Weinstein EG, Bartel DP; +RT "An abundant class of tiny RNAs with probable regulatory roles in +RT Caenorhabditis elegans"; +RL Science. 
294:858-862(2001). +XX +RN [2] +RX PUBMED; 12672692. +RA Lim LP, Lau NC, Weinstein EG, Abdelhakim A, Yekta S, Rhoades MW, Burge CB, +RA Bartel DP; +RT "The microRNAs of Caenorhabditis elegans"; +RL Genes Dev. 17:991-1008(2003). +XX +RN [3] +RX PUBMED; 12747828. +RA Ambros V, Lee RC, Lavanway A, Williams PT, Jewell D; +RT "MicroRNAs and other tiny endogenous RNAs in C. elegans"; +RL Curr Biol. 13:807-818(2003). +XX +RN [4] +RX PUBMED; 12769849. +RA Grad Y, Aach J, Hayes GD, Reinhart BJ, Church GM, Ruvkun G, Kim J; +RT "Computational and experimental identification of C. elegans microRNAs"; +RL Mol Cell. 11:1253-1263(2003). +XX +RN [5] +RX PUBMED; 17174894. +RA Ruby JG, Jan C, Player C, Axtell MJ, Lee W, Nusbaum C, Ge H, Bartel DP; +RT "Large-scale sequencing reveals 21U-RNAs and additional microRNAs and +RT endogenous siRNAs in C. elegans"; +RL Cell. 127:1193-1207(2006). +XX +RN [6] +RX PUBMED; 19460142. +RA Kato M, de Lencastre A, Pincus Z, Slack FJ; +RT "Dynamic expression of small non-coding RNAs, including novel microRNAs +RT and piRNAs/21U-RNAs, during Caenorhabditis elegans development"; +RL Genome Biol. 10:R54(2009). +XX +RN [7] +RX PUBMED; 20062054. +RA Zisoulis DG, Lovci MT, Wilbert ML, Hutt KR, Liang TY, Pasquinelli AE, Yeo +RA GW; +RT "Comprehensive discovery of endogenous Argonaute binding sites in +RT Caenorhabditis elegans"; +RL Nat Struct Mol Biol. 17:173-179(2010). +XX +DR WORMBASE; Y41G9A/23565-23469; . +XX +FH Key Location/Qualifiers +FH +FT miRNA 16..37 +FT /accession="MIMAT0000005" +FT /product="cel-miR-34-5p" +FT /evidence=experimental +FT /experiment="cloned [1-3], Northern [1], PCR [4], 454 [5], +FT Illumina [6], CLIPseq [7]" +FT miRNA 53..74 +FT /accession="MIMAT0015093" +FT /product="cel-miR-34-3p" +FT /evidence=experimental +FT /experiment="CLIPseq [7]" +XX +SQ Sequence 97 BP; 21 A; 27 C; 23 G; 0 T; 26 other; + cggacaaugc ucgagaggca gugugguuag cugguugcau auuuccuuga caacggcuac 60 + cuucacugcc accccgaaca ugucguccau cuuugaa 97 +// +ID cel-mir-35 standard; RNA; CEL; 97 BP. +XX +AC MI0000006; +XX +DE Caenorhabditis elegans miR-35 stem-loop +XX +RN [1] +RX PUBMED; 11679671. +RA Lau NC, Lim LP, Weinstein EG, Bartel DP; +RT "An abundant class of tiny RNAs with probable regulatory roles in +RT Caenorhabditis elegans"; +RL Science. 294:858-862(2001). +XX +RN [2] +RX PUBMED; 12672692. +RA Lim LP, Lau NC, Weinstein EG, Abdelhakim A, Yekta S, Rhoades MW, Burge CB, +RA Bartel DP; +RT "The microRNAs of Caenorhabditis elegans"; +RL Genes Dev. 17:991-1008(2003). +XX +RN [3] +RX PUBMED; 12747828. +RA Ambros V, Lee RC, Lavanway A, Williams PT, Jewell D; +RT "MicroRNAs and other tiny endogenous RNAs in C. elegans"; +RL Curr Biol. 13:807-818(2003). +XX +RN [4] +RX PUBMED; 17174894. +RA Ruby JG, Jan C, Player C, Axtell MJ, Lee W, Nusbaum C, Ge H, Bartel DP; +RT "Large-scale sequencing reveals 21U-RNAs and additional microRNAs and +RT endogenous siRNAs in C. elegans"; +RL Cell. 127:1193-1207(2006). +XX +RN [5] +RX PUBMED; 19460142. +RA Kato M, de Lencastre A, Pincus Z, Slack FJ; +RT "Dynamic expression of small non-coding RNAs, including novel microRNAs +RT and piRNAs/21U-RNAs, during Caenorhabditis elegans development"; +RL Genome Biol. 10:R54(2009). +XX +RN [6] +RX PUBMED; 20062054. +RA Zisoulis DG, Lovci MT, Wilbert ML, Hutt KR, Liang TY, Pasquinelli AE, Yeo +RA GW; +RT "Comprehensive discovery of endogenous Argonaute binding sites in +RT Caenorhabditis elegans"; +RL Nat Struct Mol Biol. 17:173-179(2010). +XX +RN [7] +RX PUBMED; 21307183. 
+RA Warf MB, Johnson WE, Bass BL; +RT "Improved annotation of C. elegans microRNAs by deep sequencing reveals +RT structures associated with processing by Drosha and Dicer"; +RL RNA. 17:563-577(2011). +XX +DR WORMBASE; Y62F5A/16840-16936; . +XX +FH Key Location/Qualifiers +FH +FT miRNA 22..44 +FT /accession="MIMAT0020303" +FT /product="cel-miR-35-5p" +FT /evidence=experimental +FT /experiment="Illumina [7]" +FT miRNA 61..82 +FT /accession="MIMAT0000006" +FT /product="cel-miR-35-3p" +FT /evidence=experimental +FT /experiment="cloned [1-3], Northern [1], 454 [4], Illumina +FT [5,7], CLIPseq [6]" +XX +SQ Sequence 97 BP; 20 A; 25 C; 22 G; 0 T; 30 other; + ucucggauca gaucgagcca uugcugguuu cuuccacagu gguacuuucc auuagaacua 60 + ucaccgggug gaaacuagca guggcucgau cuuuucc 97 +// +ID cel-mir-36 standard; RNA; CEL; 97 BP. +XX +AC MI0000007; +XX +SQ Sequence 97 BP; 20 A; 25 C; 22 G; 0 T; 30 other; + ucucggauca gaucgagcca uugcugguuu cuuccacagu gguacuuucc auuagaacua 60 + ucaccgggug gaaacuagca guggcucgau cuuuucc 97 +// diff --git a/src/python/test/xrefs/parsers/flatfiles/reactome_UniProt.txt b/src/python/test/xrefs/parsers/flatfiles/reactome_UniProt.txt new file mode 100644 index 000000000..3905bbc4a --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/reactome_UniProt.txt @@ -0,0 +1,8 @@ +A0A075B6P5 R-HSA-109582 https://reactome.org/PathwayBrowser/#/R-HSA-109582 Hemostasis TAS Homo sapiens +A0A075B6S6 R-HSA-1280218 https://reactome.org/PathwayBrowser/#/R-HSA-1280218 Adaptive Immune System TAS Homo sapiens +A0A075B7I6 R-HSA-1280218 https://reactome.org/PathwayBrowser/#/R-HSA-1280218 Adaptive Immune System IEA Homo sapiens +A0A087WPF7 R-HSA-1643685 https://reactome.org/PathwayBrowser/#/R-HSA-1643685 Disease TAS Homo sapiens +A0A087WPF7 R-HSA-1643685 https://reactome.org/PathwayBrowser/#/R-HSA-1643685 Disease IEA Homo sapiens +A0A087WRR7 R-HSA-166658 https://reactome.org/PathwayBrowser/#/R-HSA-166658 Complement cascade TAS Homo sapiens +A0A096LNF2 R-HSA-166663 https://reactome.org/PathwayBrowser/#/R-HSA-166663 Initial triggering of complement TAS Homo sapiens +A0A096MK16 R-HSA-166786 https://reactome.org/PathwayBrowser/#/R-HSA-166786 Creation of C4 and C2 activators TAS Homo sapiens diff --git a/src/python/test/xrefs/parsers/flatfiles/reactome_ensembl.txt b/src/python/test/xrefs/parsers/flatfiles/reactome_ensembl.txt new file mode 100644 index 000000000..6b5bb0c3d --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/reactome_ensembl.txt @@ -0,0 +1,14 @@ +ENSG00000000419 R-HSA-162699 https://reactome.org/PathwayBrowser/#/R-HSA-162699 Synthesis of dolichyl-phosphate mannose TAS Homo sapiens +ENSG00000000419 R-HSA-163125 https://reactome.org/PathwayBrowser/#/R-HSA-163125 Post-translational modification: synthesis of GPI-anchored proteins TAS Homo sapiens +ENSG00000000419 R-HSA-1643685 https://reactome.org/PathwayBrowser/#/R-HSA-1643685 Disease TAS Homo sapiens +ENSG00000000419 R-HSA-3781865 https://reactome.org/PathwayBrowser/#/R-HSA-3781865 Diseases of glycosylation TAS Homo sapiens +ENSG00000000419 R-HSA-392499 https://reactome.org/PathwayBrowser/#/R-HSA-392499 Metabolism of proteins TAS Homo sapiens +ENSX00000000419 R-HSA-446193 https://reactome.org/PathwayBrowser/#/R-HSA-446193 Biosynthesis of the N-glycan precursor (dolichol lipid-linked oligosaccharide, LLO) and transfer to a nascent protein TAS Homo sapiens +ENSG00000000419 R-HSA-446203 https://reactome.org/PathwayBrowser/#/R-HSA-446203 Asparagine N-linked glycosylation TAS Homo sapiens +ENST00000000233 R-HSA-199977 
https://reactome.org/PathwayBrowser/#/R-HSA-199977 ER to Golgi Anterograde Transport TAS Homo sapiens +ENST00000000233 R-HSA-199991 https://reactome.org/PathwayBrowser/#/R-HSA-199991 Membrane Trafficking TAS Homo sapiens +ENST00000000233 R-HSA-392499 https://reactome.org/PathwayBrowser/#/R-HSA-392499 Metabolism of proteins TAS Homo sapiens +ENST00000000233 R-HSA-446203 https://reactome.org/PathwayBrowser/#/R-HSA-446203 Asparagine N-linked glycosylation TAS Homo sapiens +ENSP00000000233 R-HSA-199977 https://reactome.org/PathwayBrowser/#/R-HSA-199977 ER to Golgi Anterograde Transport TAS Homo sapiens +ENSP00000000233 R-HSA-199991 https://reactome.org/PathwayBrowser/#/R-HSA-199991 Membrane Trafficking TAS Homo sapiens +ENSP00000000233 R-HSA-392499 https://reactome.org/PathwayBrowser/#/R-HSA-392499 Metabolism of proteins TAS Homo sapiens diff --git a/src/python/test/xrefs/parsers/flatfiles/reactome_release.txt b/src/python/test/xrefs/parsers/flatfiles/reactome_release.txt new file mode 100644 index 000000000..9f7285879 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/reactome_release.txt @@ -0,0 +1 @@ +88 \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/flatfiles/refseq_protein.txt b/src/python/test/xrefs/parsers/flatfiles/refseq_protein.txt new file mode 100644 index 000000000..ce94b07e6 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/refseq_protein.txt @@ -0,0 +1,291 @@ +LOCUS NP_001355183 382 aa linear PRI 26-JUN-2020 +DEFINITION killer cell immunoglobulin-like receptor 3DS1-like precursor [Homo + sapiens]. +ACCESSION NP_001355183 XP_024308382 +VERSION NP_001355183.1 +DBSOURCE REFSEQ: accession NM_001368254.1 +KEYWORDS RefSeq; RefSeq Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..382 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="19" + /map="19" + CDS 1..382 + /gene="LOC112268355" + /coded_by="NM_001368254.1:47..1195" + /db_xref="GeneID:112268355" +ORIGIN + 1 mllmvvsmac vglflvqrag phmggqdkpf lsawpsavvp rgghvtlrch yrhrfnnfml + 61 ykedrihvpi fhgrifqegf nmspvttaha gnytcrgshp hsptgwsaps npmvimvtgn + 121 hrwcsnkkkc ccngpracre qk +// +LOCUS NP_001337906 44 aa linear PRI 01-JUL-2020 +DEFINITION putative keratin-associated protein 20-4 [Homo sapiens]. +ACCESSION NP_001337906 +VERSION NP_001337906.1 +DBSOURCE REFSEQ: accession NM_001350977.1 +KEYWORDS RefSeq; MANE Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..44 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="21" + /map="21q22.11" + CDS 1..44 + /gene="KRTAP20-4" + /gene_synonym="KAP20.4" + /coded_by="NM_001350977.1:32..166" + /db_xref="CCDS:CCDS86982.1" + /db_xref="GeneID:100151643" + /db_xref="HGNC:HGNC:34002" +ORIGIN + 1 msyyshlsgg lgcglavavt mgrtvavaey grcrhgchss ysar +// +LOCUS XP_001243796 530 aa linear PRI 01-JUL-2020 +DEFINITION ubiquitin specific peptidase 17 like family member 30 [Homo + sapiens]. +ACCESSION XP_001243796 XP_001130476 XP_003403824 +VERSION XP_001243796.1 +DBSOURCE REFSEQ: accession NM_001256867.1 +KEYWORDS RefSeq; MANE Select. 
+SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..530 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="4" + /map="4p16.1" + CDS 1..530 + /gene="USP17L30" + /coded_by="NM_001256867.1:1..1593" + /db_xref="CCDS:CCDS59471.1" + /db_xref="GeneID:728419" + /db_xref="HGNC:HGNC:44458" +ORIGIN + 1 meddslylrg ewqfnhfskl tssrpdaafa eiqrtslpek splscetrvd lcddlapvar + 61 qlapreklpl ssrrpaavga glqnmgntcy vnaslqclty tpplanymls rehsqtchrh + 121 kgcmlctmqa hitralhnpg hviqpsqala agfhrgkqed aheflmftvd amkkaclpgh + 181 kqvdhhskdt tlihqifggy wrsqikclhc hgisdtfdpy ldialdiqaa qsvqqaleql + 241 vkpeelngen ayhcgvclqr apasktltlh tsakvlilvl krfsdvtgnk iaknvqypec + 301 ldmqpymsqp ntgplvyvly avlvhagwsc hnghyfsyvk aqegqwykmd daevtassit + 361 svlsqqayvl fyiqksewer hsesvsrgre pralgaedtd rratqgelkr dhpclqapel + 421 dehlveratq estldhwkfl qeqnktkpef nvrkvegtlp pdvlvihqsk ykcgmknhhp + 481 eqqssllnls sstpthqesm ntgtlaslrg rarrskgknk hskrallvcq +// +LOCUS NP_001229257 530 aa linear PRI 01-JUL-2020 +DEFINITION ubiquitin specific peptidase 17 like family member 26 [Homo + sapiens]. +ACCESSION NP_001229257 XP_001130428 XP_001721948 +VERSION NP_001229257.1 +DBSOURCE REFSEQ: accession NP_001229259.1 +KEYWORDS RefSeq; MANE Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..530 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="4" + /map="4p16.1" + CDS 1..530 + /gene="USP17L26" + /coded_by="NP_001229259.1:1..1593" + /db_xref="CCDS:CCDS59466.1" + /db_xref="GeneID:728379" + /db_xref="HGNC:HGNC:44454" +ORIGIN + 1 meddslylrg ewqfnhfskl tssrpdaafa eiqrtslpek splscetrvd lcddlapvar + 61 qlapreklpl ssrrpaavga glqnmgntcy vnaslqclty tpplanymls rehsqtchrh + 121 kgcmlctmqa hitralhnpg hviqpsqala agfhrgkqed aheflmftvd amkkaclpgh + 181 kqvdhhskdt tlihqifggy wrsqikclhc hgisdtfdpy ldialdiqaa qsvqqaleql + 241 vkpeelngen ayhcgvclqr apasktltlh tsakvlilvl krfsdvtgnk iaknvqypec + 301 ldmqpymsqp ntgplvyvly avlvhagwsc hnghyfsyvk aqegqwykmd daevtassit + 361 svlsqqayvl fyiqksewer hsesvsrgre pralgaedtd rratqgelkr dhpclqapel + 421 dehlveratq estldhwkfl qeqnktkpef nvrkvegtlp pdvlvihqsk ykcgmknhhp + 481 eqqssllnls sstpthqesm ntgtlaslrg rarrskgknk hskrallvcq +// +LOCUS XP_001243802 530 aa linear PRI 01-JUL-2020 +DEFINITION ubiquitin carboxyl-terminal hydrolase 17-like protein 1 [Homo + sapiens]. +ACCESSION XP_001243802 XP_006725126 XP_011544822 +VERSION XP_001243802.1 +DBSOURCE REFSEQ: accession NM_001256873.1 +KEYWORDS RefSeq; MANE Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..530 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="8" + /map="8p23.1" + CDS 1..530 + /gene="USP17L1" + /gene_synonym="USP17L1P" + /coded_by="NM_001256873.1:1..1593" + /db_xref="CCDS:CCDS78298.1" + /db_xref="GeneID:401447" + /db_xref="HGNC:HGNC:37182" +ORIGIN + 1 mgddslylgg ewqfnhfskl tssrpdaafa eiqrtslpek splssetrvd lcddlapvar + 61 qlapreklpl ssrrpaavga glqnmgntcy enaslqclty tlplanymls rehsqtcqrp + 121 kccmlctmqa hitwalhspg hviqpsqala agfhrgkqed vheflmftvd amkkaclpgh + 181 kqvdhhckdt tlihqifggc wrsqikclhc hgisdtfdpy ldialdiqaa qsvkqaleql + 241 vkpeelngen ayhcglclqr apasntltlh tsakvlilvl krfsdvagnk laknvqypec + 301 ldmqpymsqq ntgplvyvly avlvhagwsc hdghyfsyvk aqevqwykmd daevtvcsii + 361 svlsqqayvl fyiqksewer hsesvsrgre pralgaedtd rrakqgelkr dhpclqapel + 421 dehlveratq estldhwkfl qeqnktkpef nvgkvegtlp pnalvihqsk ykcgmknhhp + 481 eqqssllnls sttrtdqesm ntgtlaslqg rtrrakgknk hskrallvcq +// +LOCUS XP_001229255 530 aa linear PRI 01-JUL-2020 +DEFINITION ubiquitin specific peptidase 17 like family member 25 [Homo + sapiens]. +ACCESSION XP_001229255 XP_001130417 +VERSION XP_001229255.1 +DBSOURCE REFSEQ: accession NM_001242326.1 +KEYWORDS RefSeq; MANE Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..530 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="4" + /map="4p16.1" + CDS 1..530 + /gene="USP17L25" + /coded_by="NM_001242326.1:1..1593" + /db_xref="CCDS:CCDS59465.1" + /db_xref="GeneID:728373" + /db_xref="HGNC:HGNC:44452" +ORIGIN + 1 meddslylrg ewqfnhfskl tssrpdaafa eiqrtslpek splscetrvd lcddlapvar + 61 qlapreklpl ssrrpaavga glqnmgntcy vnaslqclty tpplanymls rehsqtchrh + 121 kgcmlctmqa hitralhnpg hviqpsqala agfhrgkqed aheflmftvd amkkaclpgh + 181 kqvdhhskdt tlihqifggy wrsqikclhc hgisdtfdpy ldialdiqaa qsvqqaleql + 241 vkpeelngen ayhcgvclqr apasktltlh tsakvlilvl krfsdvtgnk iaknvqypec + 301 ldmqpymsqp ntgplvyvly avlvhagwsc hnghyfsyvk aqegqwykmd daevtassit + 361 svlsqqayvl fyiqksewer hsesvsrgre pralgaedtd rratqgelkr dhpclqapel + 421 dehlveratq estldhwkfl qeqnktkpef nvrkvegtlp pdvlvihqsk ykcgmknhhp + 481 eqqssllnls sstpthqesm ntgtlaslrg rarrskgknk hskrallvcq +// +LOCUS NP_001229261 530 aa linear PRI 01-JUL-2020 +DEFINITION ubiquitin specific peptidase 17 like family member 29 [Homo + sapiens]. +ACCESSION NP_001229261 XP_001130464 +VERSION NP_001229261.1 +DBSOURCE REFSEQ: accession NM_001242332.1 +KEYWORDS RefSeq; MANE Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..530 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="4" + /map="4p16.1" + CDS 1..530 + /gene="USP17L29" + /coded_by="NM_001242332.1:1..1593" + /db_xref="CCDS:CCDS59470.1" + /db_xref="GeneID:728405" + /db_xref="HGNC:HGNC:44457" +ORIGIN + 1 meddslylrg ewqfnhfskl tssrpdaafa eiqrtslpek splscetrvd lcddlapvar + 61 qlapreklpl ssrrpaavga glqnmgntcy vnaslqclty tpplanymls rehsqtchrh + 121 kgcmlctmqa hitralhnpg hviqpsqala agfhrgkqed aheflmftvd amkkaclpgh + 181 kqvdhhskdt tlihqifggy wrsqikclhc hgisdtfdpy ldialdiqaa qsvqqaleql + 241 vkpeelngen ayhcgvclqr apasktltlh tsakvlilvl krfsdvtgnk iaknvqypec + 301 ldmqpymsqp ntgplvyvly avlvhagwsc hnghyfsyvk aqegqwykmd daevtassit + 361 svlsqqayvl fyiqksewer hsesvsrgre pralgaedtd rratqgelkr dhpclqapel + 421 dehlveratq estldhwkfl qeqnktkpef nvrkvegtlp pdvlvihqsk ykcgmknhhp + 481 eqqssllnls sstpthqesm ntgtlaslrg rarrskgknk hskrallvcq +// +LOCUS NP_001229260 530 aa linear PRI 01-JUL-2020 +DEFINITION ubiquitin specific peptidase 17 like family member 28 [Homo + sapiens]. +ACCESSION NP_001229260 XP_001130452 +VERSION NP_001229260.1 +DBSOURCE REFSEQ: accession NM_001242331.1 +KEYWORDS RefSeq; MANE Select. +SOURCE Bos taurus (cow) + ORGANISM Bos taurus + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..530 + /organism="Bos taurus" + /db_xref="taxon:9913" + /chromosome="4" + /map="4p16.1" + CDS 1..530 + /gene="USP17L28" + /coded_by="NM_001242331.1:1..1593" + /db_xref="CCDS:CCDS59469.1" + /db_xref="GeneID:728400" + /db_xref="HGNC:HGNC:44456" +ORIGIN + 1 meddslylrg ewqfnhfskl tssrpdaafa eiqrtslpek splscetrvd lcddlapvar + 61 qlapreklpl ssrrpaavga glqnmgntcy vnaslqclty tpplanymls rehsqtchrh + 121 kgcmlctmqa hitralhnpg hviqpsqala agfhrgkqed aheflmftvd amkkaclpgh + 181 kqvdhhskdt tlihqifggy wrsqikclhc hgisdtfdpy ldialdiqaa qsvqqaleql + 241 vkpeelngen ayhcgvclqr apasktltlh tsakvlilvl krfsdvtgnk iaknvqypec + 301 ldmqpymsqp ntgplvyvly avlvhagwsc hnghyfsyvk aqegqwykmd daevtassit + 361 svlsqqayvl fyiqksewer hsesvsrgre pralgaedtd rratqgelkr dhpclqapel + 421 dehlveratq estldhwkfl qeqnktkpef nvrkvegtlp pdvlvihqsk ykcgmknhhp + 481 eqqssllnls sstpthqesm ntgtlaslrg rarrskgknk hskrallvcq +// +LOCUS NP_001229259 530 aa linear PRI 01-JUL-2020 +DEFINITION ubiquitin specific peptidase 17 like family member 27 [Homo + sapiens]. +ACCESSION NP_001229259 XP_001130444 +VERSION NP_001229259.1 +DBSOURCE REFSEQ: accession NM_001242328.1 +KEYWORDS RefSeq; MANE Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..530 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="4" + /map="4p16.1" + CDS 1..530 + /gene="USP17L27" + /coded_by="NM_001242328.1:1..1593" + /db_xref="CCDS:CCDS59468.1" + /db_xref="GeneID:728393" + /db_xref="HGNC:HGNC:44455" +ORIGIN + 1 meddslylrg ewqfnhfskl tssrpdaafa eiqrtslpek splscetrvd lcddlapvar + 61 qlapreklpl ssrrpaavga glqnmgntcy vnaslqclty tpplanymls rehsqtchrh + 121 kgcmlctmqa hitralhnpg hviqpsqala agfhrgkqed aheflmftvd amkkaclpgh + 181 kqvdhhskdt tlihqifggy wrsqikclhc hgisdtfdpy ldialdiqaa qsvqqaleql + 241 vkpeelngen ayhcgvclqr apasktltlh tsakvlilvl krfsdvtgnk iaknvqypec + 301 ldmqpymsqp ntgplvyvly avlvhagwsc hnghyfsyvk aqegqwykmd daevtassit + 361 svlsqqayvl fyiqksewer hsesvsrgre pralgaedtd rratqgelkr dhpclqapel + 421 dehlveratq estldhwkfl qeqnktkpef nvrkvegtlp pdvlvihqsk ykcgmknhhp + 481 eqqssllnls sstpthqesm ntgtlaslrg rarrskgknk hskrallvcq +// diff --git a/src/python/test/xrefs/parsers/flatfiles/refseq_release.txt b/src/python/test/xrefs/parsers/flatfiles/refseq_release.txt new file mode 100644 index 000000000..ecaeeb1e8 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/refseq_release.txt @@ -0,0 +1,94 @@ +******************************************************************************** +RefSeq-release224.txt ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-notes/ + + NCBI Reference Sequence (RefSeq) Database + + Release 224 + May 6, 2024 + + Distribution Release Notes + +Release Size: + 150742 organisms + 4379003578168 nucleotide bases + 126991769080 amino acids + 435879646 records +****************************************************************************** + +This document describes the format and content of the flat files that +comprise releases of the NCBI Reference Sequence (RefSeq) database. + +Additional information about RefSeq is available at: + +1. NCBI Bookshelf: + a) NCBI Handbook: + https://www.ncbi.nlm.nih.gov/books/NBK21091/ + b) RefSeq Help (FAQ) + https://www.ncbi.nlm.nih.gov/books/NBK50680/ + +2. RefSeq Web Sites: + RefSeq Home: https://www.ncbi.nlm.nih.gov/refseq/ + RefSeqGene Home: https://www.ncbi.nlm.nih.gov/refseq/rsg/ + +If you have any questions or comments about RefSeq, the RefSeq release files +or this document, please contact NCBI by email at: + info@ncbi.nlm.nih.gov. + +To receive announcements of future RefSeq releases and large updates please +subscribe to NCBI's refseq-announce mail list: + + send email to refseq-announce-subscribe@ncbi.nlm.nih.gov + with "subscribe" in the subject line (without quotes) + and nothing in the email body + +OR + +subscribe using the web interface at: + https://www.ncbi.nlm.nih.gov/mailman/listinfo/refseq-announce + +============================================================================= +TABLE OF CONTENTS +============================================================================= +1. INTRODUCTION + 1.1 This release + 1.2 Cutoff date + 1.3 RefSeq Project Background + 1.3.1 Sequence accessions, validation, and annotations + 1.3.2 Data assembly, curation, and collaboration + 1.3.3 Biologically non-redundant data set + 1.3.4 RefSeq and DDBJ/EMBL/GenBank comparison + 1.4 Uses and applications of the RefSeq database +2. CONTENT + 2.1 Organisms included + 2.2 Molecule Types included + 2.3 Known Problems, Redundancies, and Inconsistencies + 2.4 Release Catalog + 2.5 Changes since the previous release +3. 
ORGANIZATION OF DATA FILES + 3.1 FTP Site Organization + 3.2 Release Contents + 3.3 File Names and Formats + 3.4 File Sizes + 3.5 Statistics + 3.6 Release Catalog + 3.7 Removed Records + 3.8 Accession Format + 3.9 Growth of RefSeq +4. FLAT FILE ANNOTATION + 4.1 Main features of RefSeq Flat File + 4.1.1 LOCUS, DEFLINE, ACCESSION, KEYWORDS, SOURCE, ORGANISM + 4.1.2 REFERENCE, DIRECT SUBMISSION, COMMENT, PRIMARY + 4.1.3 NUCLEOTIDE FEATURE ANNOTATION (Gene, mRNA, CDS) + 4.1.4 PROTEIN FEATURE ANNOTATION + 4.2 Tracking Identifiers + 4.2.1 GeneID + 4.2.2 Transcript ID + 4.2.3 Protein ID + 4.2.4 Conserved Domain Database (CDD) ID +5. REFSEQ ADMINISTRATION + 5.1 Citing RefSeq + 5.2 RefSeq Distribution Formats + 5.3 Other Methods of Accessing RefSeq Data + 5.4 Request for Corrections and Comments + 5.5 Credits and Acknowledgements + 5.6 Disclaimer diff --git a/src/python/test/xrefs/parsers/flatfiles/refseq_rna.txt b/src/python/test/xrefs/parsers/flatfiles/refseq_rna.txt new file mode 100644 index 000000000..da86f9d30 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/refseq_rna.txt @@ -0,0 +1,508 @@ +LOCUS NR_168385 3420 bp RNA linear PRI 13-MAY-2020 +DEFINITION Homo sapiens LOC105373289 (LOC105373289), transcript variant 6, + long non-coding RNA. +ACCESSION NR_168385 +VERSION NR_168385.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..3420 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="1" + /map="1q42.13" + gene 1..3420 + /gene="LOC105373289" + /note="LOC105373289" + /db_xref="GeneID:105373289" + ncRNA 1..3420 + /ncRNA_class="lncRNA" + /gene="LOC105373289" + /product="LOC105373289, transcript variant 6" + /db_xref="GeneID:105373289" +ORIGIN + 1 agcagggcgt ccagcggaga aggcagagga ggggagatgc gggctcctcc aggtagcgca + 61 ggagcccctc cggctgccgg agccccgcga gggcgcgagt ggagggcagg agcccgggcg + 121 gcggaggagc ggaagggatg ctgcgttgcc ttggagtgtc agggtggggg aggaaagacc + 181 aagggaccca cgtccttcgc ccccgccgcg gagtcccggg ccggcgagac ttccgcagcc + 241 tgcccagcgc cggggaccta gggctttgca ggagtccgcc cgggagctct atcagagcgg + 301 gcgtcctccc cgccgctcca aaggtggctt ggggcaggtg gggcgtcccg gagggaatgg + 361 agggaccctg cctagggaag gagggatttc gctgcctgtg gggcttcagt gctgaaccag + 421 gcagccctga gcagaccagg accgagcttc ccaaacctga ccgggaagga gccctggttg + 481 catctgggat ccacgtggtc gacagagaat cagctcgcag ctcaccaccc cagtgacttc + 541 agggcagccc accttcccct ggcgctcctc aaacgagcca gggagtggcc cctgctcaga + 601 ctcccctcct gcctcccgga ccctgcaggc ctacccgccc cagttgccct ttgccctcct + 661 gcagccttct gggggtgcta catgtctgag gcccggtctt ctgtcctgct cctcctgatg + 721 gggggtctgg gcactctccc taattcatcg cgaagactct gacacccaat gcccgtcttc + 781 aggccccggc agatgcagag aagtgggctt cacacccaca tctgcctgac ctcaggtgct + 841 ggctcctgca gtcacagccc tgagccccgg cccctccagg ctgtctcctg cttgtccagg + 901 tgggcatgag ctggtcagtt cctggccact gcccttcaga ccccatgcca ggactttggg + 961 ttgggctctg ggcatggcac tagccaggcc tgggtgcctc cttgagcagc tgagggctgg + 1021 gagggatgac aatgtaagcg gctatctggc ttcaggccca ggctggccat ctggtggcca +// +LOCUS NR_168384 3463 bp RNA linear PRI 13-MAY-2020 +DEFINITION Homo sapiens LOC105373289 (LOC105373289), transcript variant 5, + long non-coding RNA. +ACCESSION NR_168384 +VERSION NR_168384.1 +KEYWORDS RefSeq. 
+SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..3463 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="1" + /map="1q42.13" + gene 1..3463 + /gene="LOC105373289" + /note="LOC105373289" + /db_xref="GeneID:105373289" + ncRNA 1..3463 + /ncRNA_class="lncRNA" + /gene="LOC105373289" + /product="LOC105373289, transcript variant 5" + /db_xref="GeneID:105373289" +ORIGIN + 1 agcagggcgt ccagcggaga aggcagagga ggggagatgc gggctcctcc aggtagcgca + 61 ggagcccctc cggctgccgg agccccgcga gggcgcgagt ggagggcagg agcccgggcg + 121 gcggaggagc ggaagggatg ctgcgttgcc ttggagtgtc agggtggggg aggaaagacc + 181 aagggaccca cgtccttcgc ccccgccgcg gagtcccggg ccggcgagac ttccgcagcc + 241 tgcccagcgc cggggaccta gggctttgca ggagtccgcc cgggagctct atcagagcgg + 301 gcgtcctccc cgccgctcca aaggtggctt ggggcaggtg gggcgtcccg gagggaatgg + 361 agggaccctg cctagggaag tctggcacat tcctccccaa acaccggcgt cttcccatgg + 421 caggagggat ttcgctgcct gtggggcttc agtgctgaac caggcagccc tgagcagacc + 481 aggaccgagc ttcccaaacc tgaccgggaa ggagccctgg ttgcatctgg gatccacgtg + 541 gtcgacagag aatcagctcg cagctcacca ccccagtgac ttcagggcag cccaccttcc + 601 cctggcgctc ctcaaacgag ccagggagtg gcccctgctc agactcccct cctgcctccc + 661 ggaccctgca ggcctacccg ccccagttgc cctttgccct cctgcagcct tctgggggtg + 721 ctacatgtct gaggcccggt cttctgtcct gctcctcctg atggggggtc tgggcactct + 781 ccctaattca tcgcgaagac tctgacaccc aatgcccgtc ttcaggcccc ggcagatgca + 841 gagaagtggg cttcacaccc acatctgcct gacctcaggt gctggctcct gcagtcacag + 901 ccctgagccc cggcccctcc aggctgtctc ctgcttgtcc aggtgggcat gagctggtca + 961 gttcctggcc actgcccttc agaccccatg ccaggacttt gggttgggct ctgggcatgg + 1021 agaagtattc tgcttgacat acataaaaat gcataattca aaa +// +LOCUS XR_168380 3423 bp RNA linear PRI 13-MAY-2020 +DEFINITION Homo sapiens LOC105373289 (LOC105373289), transcript variant 1, + long non-coding RNA. +ACCESSION XR_168380 +VERSION XR_168380.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..3423 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="1" + /map="1q42.13" + gene 1..3423 + /gene="LOC105373289" + /note="LOC105373289" + /db_xref="GeneID:105373289" + ncRNA 1..3423 + /ncRNA_class="lncRNA" + /gene="LOC105373289" + /product="LOC105373289, transcript variant 1" + /db_xref="GeneID:105373289" +ORIGIN + 1 agtcccaggg aggagaccgc gggagaggcg gcgggaccag ggtcccggcc ttcagcggct + 61 tgctccgcac actcagggtt ccccggccct ctggcgctgg gggagttggg tcggttgtgc + 121 atgctgcatg gccggaggct cggggccaag gccacccttc cgcacccacc actctgggag + 181 gctccagagc gcggccctga gatagtgcca cactcacccc ctggaaagga ggcaaggccg + 241 ccctggacgg aggcgactcg gagtcccggg aggaaggaac ggacacaccg gcctccctgc + 301 ggaggaggga gaacgtggtc cccagtggta tcaggaagag tctggcacat tcctccccaa + 361 acaccggcgt cttcccatgg caggagggat ttcgctgcct gtggggcttc agtgctgaac + 421 caggcagccc tgagcagacc aggaccgagc ttcccaaacc tgaccgggaa ggagccctgg + 481 ttgcatctgg gatccacgtg gtcgacagag aatcagctcg cagctcacca ccccagtgac + 541 ttcagggcag cccaccttcc cctggcgctc ctcaaacgag ccagggagtg gcccctgctc + 601 agactcccct cctgcctccc ggaccctgca ggcctacccg ccccagttgc cctttgccct + 661 cctgcagcct tctgggggtg ctacatgtct gaggcccggt cttctgtcct gctcctcctg + 721 atggggggtc tgggcactct ccctaattca tcgcgaagac tctgacaccc aatgcccgtc + 781 ttcaggcccc ggcagatgca gagaagtggg cttcacaccc acatctgcct gacctcaggt + 841 gctggctcct gcagtcacag ccctgagccc cggcccctcc aggctgtctc ctgcttgtcc + 901 aggtgggcat gagctggtca gttcctggcc actgcccttc agaccccatg ccaggacttt + 961 gggttgggct ctgggcatgg cactagccag gcctgggtgc ctccttgagc agctgagggc + 1021 tgggagggat gacaatgtaa gcggctatct ggcttcaggc ccaggctggc catctggtgg + 1081 aaa +// +LOCUS NM_001242328 925 bp RNA linear PRI 14-MAY-2020 +DEFINITION Homo sapiens uncharacterized LOC107985524 (LOC107985524), long + non-coding RNA. +ACCESSION NM_001242328 XM_024452028 +VERSION NM_001242328.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..925 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="1" + /map="1p11.2" + gene 1..925 + /gene="LOC107985524" + /note="uncharacterized LOC107985524" + /db_xref="GeneID:107985524" + ncRNA 1..925 + /ncRNA_class="lncRNA" + /gene="LOC107985524" + /product="uncharacterized LOC107985524" + /db_xref="GeneID:107985524" +ORIGIN + 1 gcacacctgg ctcacggcga gtgcggagca gaaagcacta ctggcgcggg ccacagccag + 61 ccgctttcat ctgctaagac ctcacctgaa aggcgcacca gtgccctcaa ggatcctccc + 121 gcctctgcag gatgtcgagg ctcctcctcg ccgggaggag aaagcggaat cccctccctg + 181 cattctcgga cagtgccacg tcctccggct gccagcgggg cagcgccgct aggtatgtga + 241 gcttcaaagt tggaagaaat taagcaacat gctttggaat ctatggtgat ctatagaaag + 301 gcaaagtttc tggactcacc ctgactgatg gaaagacaga ctgcctgcca ggacactacc + 361 ctgctgtacc cagtcttaag tataataaag atctcatttt ttactgtcaa tgcaagccac + 421 attttcctat taggaaaatg tgaatgaaac aaagtgctct tcaagagcaa accctgaatt + 481 atactttggg ttattctctg ttcctcaaaa ggattttgca tctaactgat agtctccaaa + 541 ttgtaatgac agtatataga tagcttggtg tagacataca ggtcaataca aatggagaaa + 601 aggcaatttg ccattgaaga atatgtttgc tttaagtaaa gatcaatata ctaagaaagc + 661 tatacatatc tagacttcca aaaacagatg ggaataaact actcagcaat cagaatattc + 721 gaagatggca ctctgttcac ttccagagaa aatagttcaa aactgtatct caaagtggat + 781 ataagctatt gtactagaat tagtccctgt gtgagcattt ggcattataa aataagatgt + 841 tcccaatgaa aagatcactg gtatgtagat aataaaatgt gaaaataaaa atttaaaaat + 901 aaaacaaaaa ttatgtgata ataaa +// +LOCUS NM_028389 2517 bp RNA linear PRI 17-MAY-2020 +DEFINITION Homo sapiens ACVR2B antisense RNA 1 (ACVR2B-AS1), long non-coding + RNA. +ACCESSION NM_028389 XM_001717717 +VERSION NM_028389.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..2517 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="3" + /map="3p22.2" + gene 1..2517 + /gene="ACVR2B-AS1" + /note="ACVR2B antisense RNA 1" + /db_xref="GeneID:100128640" + /db_xref="HGNC:HGNC:44161" + ncRNA 1..2517 + /ncRNA_class="lncRNA" + /gene="ACVR2B-AS1" + /product="ACVR2B antisense RNA 1" + /db_xref="GeneID:100128640" + /db_xref="HGNC:HGNC:44161" +ORIGIN + 1 gctacactta gtgactctga gggacatgca accctccccg catgctgctg ctgctgctgc + 61 acctacaatc ctgccacccc caatgagatc tgcccacccc tcttggccgc cttccccacg + 121 ctcaggtttt cctcactctt tccctgggtt ccacgcgccc gcgtagcccg aactccgacc + 181 ctgaggctcc gcgtcccggc ccccatcgca ggggcgcctc taggaaccag aatcccgcag + 241 atgactgcac agacaagatc gtgcccccaa gttcggcgag ccgggcgccc accgcgcccc + 301 cagcccacgc ccccggagtt cctgcgccac ccacagcggc cctgagcttc aatctgcact + 361 gactgcactc ccatctcctt ggctgcagca cctgattaaa gccttcttcc ttggcaa +// +LOCUS NM_039609 79 bp RNA linear PRI 17-MAY-2020 +DEFINITION Homo sapiens microRNA 378e (MIR378E), microRNA. +ACCESSION NM_039609 +VERSION NM_039609.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..79 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="5" + /map="5q35.1" + gene 1..79 + /gene="MIR378E" + /gene_synonym="mir-378e" + /note="microRNA 378e" + /db_xref="GeneID:100616498" + /db_xref="HGNC:HGNC:41671" + /db_xref="miRBase:MI0016750" + precursor_RNA 1..79 + /gene="MIR378E" + /gene_synonym="mir-378e" + /product="microRNA 378e" + /db_xref="GeneID:100616498" + /db_xref="HGNC:HGNC:41671" + /db_xref="miRBase:MI0016750" +ORIGIN + 1 ctgactccag tgtccaggcc aggggcagac agtggacaga gaacagtgcc caagaccact + 61 ggacttggag tcaggacat +// +LOCUS NM_039939 83 bp RNA linear PRI 24-MAY-2020 +DEFINITION Homo sapiens microRNA 4779 (MIR4779), microRNA. +ACCESSION NM_039939 +VERSION NM_039939.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..83 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="2" + /map="2p11.2" + gene 1..83 + /gene="MIR4779" + /note="microRNA 4779" + /db_xref="GeneID:100616159" + /db_xref="HGNC:HGNC:41747" + /db_xref="miRBase:MI0017423" + precursor_RNA 1..83 + /gene="MIR4779" + /product="microRNA 4779" + /db_xref="GeneID:100616159" + /db_xref="HGNC:HGNC:41747" + /db_xref="miRBase:MI0017423" +ORIGIN + 1 taaatgtctt actgctttta ctgttccctc ctagagtcca ttctttactc taggagggaa + 61 tagtaaaagc agtaagacat tta +// +LOCUS NM_003928 2843 bp RNA linear PRI 31-MAY-2020 +DEFINITION Homo sapiens chitinase, acidic pseudogene 2 (CHIAP2), non-coding + RNA. +ACCESSION NM_003928 +VERSION NM_003928.2 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..2843 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="1" + /map="1p13.2" + gene 1..2843 + /gene="CHIAP2" + /note="chitinase, acidic pseudogene 2" + /pseudo + /db_xref="GeneID:149620" + /db_xref="HGNC:HGNC:44463" + misc_RNA 1..2843 + /gene="CHIAP2" + /product="chitinase, acidic pseudogene 2" + /pseudo + /db_xref="GeneID:149620" + /db_xref="HGNC:HGNC:44463" +ORIGIN + 1 gtctgctcct tgtgctgaca gctgaaatag gctctgccta ccagctgaca tgttacttca + 61 ccaactgggc ccagaaccag ccaggcctgg ggtgcttcaa gcctgatgac atcgacccct + 121 gcctctgtac ccacttgatc tacgcctttg ctggaatgca gaacaacgag atcaccacca + 181 tcgaatggga tgacatgact ctctaccaag ctttcaatgg cctgaaaaac aagtaaatga + 241 cggaaaacct gagtttcaaa tcttttaacc tttaaggaca gtttaaacaa gatcttccac + 301 agcagacttc aggctgaaat tccaaacagg ccaacaagca ggtaaattca gctttcttat + 361 tatttcaagt gcaagaatga ctctaatttt aaggggaatg gctggctcac agaagctagc + 421 tgctaactaa agcccagctc agttgccaag ggaagcttat aagtccaact actggtggac + 481 tcagttgaga acaatcttcc acttagaagc aatccaaagc tggcattgat aaagcattca + 541 gtctccttgg tcaggagatt cactcctagg gaaataattg gaactgtgga gacattgggg + 601 aaaaaaaaaa aaaaaaaaaa aaa +// +LOCUS XM_107042 80 bp RNA linear PRI 31-MAY-2020 +DEFINITION Homo sapiens microRNA 8075 (MIR8075), microRNA. +ACCESSION XM_107042 +VERSION XM_107042.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..80 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="13" + /map="13q34" + gene 1..80 + /gene="MIR8075" + /gene_synonym="hsa-mir-8075" + /note="microRNA 8075" + /db_xref="GeneID:102465874" + /db_xref="HGNC:HGNC:50172" + /db_xref="miRBase:MI0025911" + precursor_RNA 1..80 + /gene="MIR8075" + /gene_synonym="hsa-mir-8075" + /product="microRNA 8075" + /db_xref="GeneID:102465874" + /db_xref="HGNC:HGNC:50172" + /db_xref="miRBase:MI0025911" +ORIGIN + 1 ccttgctgat ggcagatgtc ggatctgcct cgcttatacg tgcccttgct gatggcagat + 61 gtcgggtctg cctcgcttat +// +LOCUS XM_120501 718 bp RNA linear PRI 31-MAY-2020 +DEFINITION Homo sapiens SYNE1 antisense RNA 1 (SYNE1-AS1), long non-coding + RNA. +ACCESSION XM_120501 +VERSION XM_120501.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..718 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="6" + /map="6q25.2" + gene 1..718 + /gene="SYNE1-AS1" + /note="SYNE1 antisense RNA 1" + /db_xref="GeneID:100505475" + /db_xref="HGNC:HGNC:40793" + ncRNA 1..718 + /ncRNA_class="lncRNA" + /gene="SYNE1-AS1" + /product="SYNE1 antisense RNA 1" + /db_xref="GeneID:100505475" + /db_xref="HGNC:HGNC:40793" +ORIGIN + 1 aaaaccacac agaggagagc taatcgggga gataactatt tgctgtgctg taggagatta + 61 ccaagagggt taactgctga gcccaggttt tcagaagggc actgaactgt tccagggcct + 121 gctccagttg agccacttgg cctgagaatt cctgctccga aagggccatc tggctgacca + 181 ggttctccaa acagctctgc gtttggaata cactgtcttc ccactgcttc cagtcggcac + 241 gcagggcctg catctccgtg tgcatgagct cacacccact ggcagttgtg ttctgtttca + 301 cttcgggagc cagcgactcc actctgctga gacggcttgc accaatctct ctggaatcta + 361 tcagctcctg taatggaata tcaccatggt aactgaagag cctgtgagtc acttggactg + 421 caagttttca actgtgtaca cagggggacc ctgtcctgcc aggaagtttt aacaagtgtg + 481 ccacaaggac ccagaatcaa ttcatctgtc cactcttaaa ttataatagc gggagttgtc + 541 atcacgagtg cctaaggctt atggcaaaaa aatcccgaga atcctcaata tcctgaaaca + 601 gactaaacag agaccctgaa atcaggcatt attcgcacac gtgaaaatgt ttagtgactc + 661 aagtgtttgc ctgtggtgga ttgctcctgt gaatgattaa acccatattt ccctcaaa +// +LOCUS NR_038942 1432 bp RNA linear PRI 31-MAY-2020 +DEFINITION Bos taurus promoter of MAT2A antisense radiation-induced + circulating long non-coding RNA (PARTICL), long non-coding RNA. +ACCESSION NR_038942 +VERSION NR_038942.1 +KEYWORDS RefSeq. +SOURCE Bos taurus (cow) + ORGANISM Bos taurus + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..1432 + /organism="Bos taurus" + /mol_type="transcribed RNA" + /db_xref="taxon:9913" + /chromosome="2" + /map="2p11.2" + gene 1..1432 + /gene="PARTICL" + /gene_synonym="PARTICLE" + /note="promoter of MAT2A antisense radiation-induced + circulating long non-coding RNA" + /db_xref="GeneID:100630918" + /db_xref="HGNC:HGNC:50886" + /db_xref="MIM:616350" + ncRNA 1..1432 + /ncRNA_class="lncRNA" + /gene="PARTICL" + /gene_synonym="PARTICLE" + /product="promoter of MAT2A antisense radiation-induced + circulating long non-coding RNA" + /db_xref="GeneID:100630918" + /db_xref="HGNC:HGNC:50886" + /db_xref="MIM:616350" +ORIGIN + 1 tgtgggagaa aaagcgactg gggcttgctg gacccgctcc cctgtgcggt gacatggcac + 61 ctcccttccg accgttggcc gggatagctt tcccggaggt cgcgtccccc agcggaggcg + 121 gcgggaatcg cggaggtttt gtgctgcggc tggggtcttt ctggccgtcc cgcctctaga + 181 tgccgggtgg cagggagccg gtggcggtcg gccatgtgaa ggtggccatc ttggccggcc + 241 agcgagggct cctcacggcc ttccttcggc gtccctgccc ggctcgttgc cggccccggg + 301 atatctgagg ggcgcaggcc caccttctag tcgtttcctg gtgaatggct ctgctgaaga + 361 tggccggaaa gcagattaat gaaggtgctg ccatcgattt aaacaatcgc cctcctgccc + 421 gctcccctgc gctaaagttt ctgagggatc ctcacccttc gtggttcgtg gacttaaaag + 481 tggaggcagc gctccagcct ttccctccag agagaaagga ggccgctccc aagtccgtcc + 541 ttgccccgtg gccttcctgt tcctttgaag ggggggggaa tcgatgtttc aatcctctgt + 601 tcaggagaat atggaacgaa catttctttt ttggtgggtg ggggctattc gttcccttga + 661 atgtgcttaa gcagatctct tgacggcgtg gaatgggctg tttcatgaag ctttcacttt + 721 aaaatgtcca cctgcgtttg tcccagtttt gcccaataaa ggaattacag ggaaaaagag + 781 cgaaacaaaa cttgagccag caaggagtat ggagtcccgt ttggagggag cccgccttgg + 841 gggcgggggt ctctgcagcc tgttctgggg ctgggcctcg gtgcaggtct ggtcgggctg + 901 gtggcctggt ggccctgagc ggagcgcgtg acaagaacgc cgggtttaat gaggttctca + 961 gggaacggcc tgctcctagc atgggatgta ctttacgctg gggaggtgaa ggagacccct + 1021 agtaacagtc tccagctgcc tactgctggc ggcagtagaa caaggtgcat tcccaagaac + 1081 aatcctctcc gtcggtgcag gaggaaaggt gggatttgct aaaggctcag tgggaaacaa + 1141 aggaagcact tgggctgtgc tgggggcatc cacaaagaag ataaagggct cctccccaga + 1201 ctcagtgacc acatccacgg tcgctcaaag caggcagcaa gaaaaggttc gaccacaaaa + 1261 gagacaatag gtgatgatat ttttattcgt tgctttttac ttttcaagct aacccttcat + 1321 gggaaagtaa ctgtataaga ctattaaatt tttttgttat tttttaaagg agaatgccaa + 1381 tttattaact tacaatgtgt aataaaattg tcaactggaa aaaaaaaaaa aa +// +LOCUS NY_108110 808 bp RNA linear PRI 31-MAY-2020 +DEFINITION Homo sapiens B4GALT1 antisense RNA 1 (B4GALT1-AS1), transcript + variant 3, long non-coding RNA. +ACCESSION NY_108110 +VERSION NY_108110.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..808 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="9" + /map="9p21.1" + gene 1..808 + /gene="B4GALT1-AS1" + /note="B4GALT1 antisense RNA 1" + /db_xref="GeneID:101929639" + /db_xref="HGNC:HGNC:49910" + ncRNA 1..808 + /ncRNA_class="lncRNA" + /gene="B4GALT1-AS1" + /product="B4GALT1 antisense RNA 1, transcript variant 3" + /db_xref="GeneID:101929639" + /db_xref="HGNC:HGNC:49910" +ORIGIN + 1 ggaggactgc ccgatggcgg cggcactgtt cgagccgccc tgcagcggtg tggagactcc + 61 gaccagttgg ggcaggcggc tcaggtcgcg gccagccagg ggttgggaaa ctagtttcca + 121 tggccaaacc tagcccaccg tctgttttgt ttcctgttct tcccaacctg cgctatggac + 181 ttcatcagat ttcagcatca gagagaatat ggaaggacat cgaccctaac ttcatccagt + 241 gaggatttcc acacaccata cactctctga gagttctctt ggctttgtgt gcacacctcc + 301 agtgacaggg agctcgctat gtcatgaggc agcctgctcc cttgtggcta tcactgaacc + 361 aactattaag ccttcttata caaacagtcc ctgacttact gttagactta caactttttg + 421 attttacagt ggtgcaaaag caatatgcat tccatagaaa ctgtatttca aattttgaat + 481 tttgatcttt tccaggctag caacatatga agaccaacct tctattttta aaataggctt + 541 tgtgttagat gcttttgccc aactataggt taatgtaaat gttctgagaa tatttgagga + 601 aggctaggct aaactgtgat ctttgggagc ttagatatgt taagtgcatt ttcaacttac + 661 aatattttca acttacgatg agtttattgg gatgaagccc gttgtaagta aaggagcatc + 721 tgcattaagc taaaatccat gtctataact tcctcagtaa tatcaagttt gttccttgga + 781 gccagagaat aaactgaatc cttttcca +// diff --git a/src/python/test/xrefs/parsers/flatfiles/rfam.txt b/src/python/test/xrefs/parsers/flatfiles/rfam.txt new file mode 100644 index 000000000..0dcfad606 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/rfam.txt @@ -0,0 +1,381 @@ +# STOCKHOLM 1.0 + +#=GF AC RF00001 +#=GF ID 5S_rRNA +#=GF DE 5S ribosomal RNA +#=GF AU Griffiths-Jones SR; 0000-0001-6043-807X +#=GF AU Mifsud W; 0000-0002-9805-6461 +#=GF AU Gardner PP; 0000-0002-7808-1213 +#=GF SE Szymanski et al, 5S ribosomal database, PMID:11752286 +#=GF SS Published; PMID:11283358 +#=GF GA 38.00 +#=GF TC 38.00 +#=GF NC 37.90 +#=GF TP Gene; rRNA; +#=GF BM cmbuild -F CM SEED +#=GF CB cmcalibrate --mpi CM +#=GF SM cmsearch --cpu 4 --verbose --nohmmonly -T 24.99 -Z 549862.597050 CM SEQDB +#=GF CL CL00113 +#=GF DR URL; http://rose.man.poznan.pl/5SData/; +#=GF DR SO; 0000652; rRNA_5S; +#=GF DR GO; 0003735; structural constituent of ribosome; +#=GF DR GO; 0005840; ribosome; +#=GF RN [1] +#=GF RM 11752286 +#=GF RT 5S Ribosomal RNA Database. +#=GF RA Szymanski M, Barciszewska MZ, Erdmann VA, Barciszewski J +#=GF RL Nucleic Acids Res 2002;30:176-178. +#=GF RN [2] +#=GF RM 11283358 +#=GF RT Crystal structure of the ribosome at 5.5 A resolution. +#=GF RA Yusupov MM, Yusupova GZ, Baucom A, Lieberman K, Earnest TN, Cate JH, +#=GF RA Noller HF +#=GF RL Science 2001;292:883-896. +#=GF RN [3] +#=GF RM 10926492 +#=GF RT The role of the central zinc fingers of transcription factor IIIA in +#=GF RT binding to 5 S RNA. +#=GF RA Searles MA, Lu D, Klug A +#=GF RL J Mol Biol 2000;301:47-60. +#=GF RN [4] +#=GF RM 23838690 +#=GF RT Systematic analysis and evolution of 5S ribosomal DNA in metazoans. +#=GF RA Vierna J, Wehner S, Honer zu Siederdissen C, Martinez-Lage A, Marz M +#=GF RL Heredity (Edinb). 2013;111:410-421. +#=GF CC 5S ribosomal RNA (5S rRNA) is a component of the large ribosomal subunit +#=GF CC in both prokaryotes and eukaryotes. In eukaryotes, it is synthesised by +#=GF CC RNA polymerase III (the other eukaryotic rRNAs are cleaved from a 45S +#=GF CC precursor synthesised by RNA polymerase I). 
In Xenopus oocytes, it has +#=GF CC been shown that fingers 4-7 of the nine-zinc finger transcription factor +#=GF CC TFIIIA can bind to the central region of 5S RNA. Thus, in addition to +#=GF CC positively regulating 5S rRNA transcription, TFIIIA also stabilises 5S +#=GF CC rRNA until it is required for transcription. +#=GF WK 5S_ribosomal_RNA +#=GF SQ 712 + + + +X01556.1/3-118 --CUUGAC-GA-U-C-AU-AGA----GC-G-U-U-G---GA----------A-CC-A----------C--CUG----A-UC----CCUUCC------CGA-ACUCA-GA-AGUGAA-A-----------------CGACGCA-U-C--G---CC--GAUG----GUAGUGUG----GGGUUUC-C-CCAUG-UGA---G---AGUA----GG-U-CA-UC--G-UCAAGC +X55260.1/3-119 --UACGGC-GG-C-C-AU-AGC----GA-A-G-G-G---GA----------A-AU-A----------C--CCG----G-UC----CCAUCC------CGA-ACCCG-GA-AGUCAA-G-----------------CCCUUCA-G-C--G---CC--GAUG----GUACUGCAA---CCGAGAG-G-CUGUG-GGA---G---AGUA----GG-A-CG-CC--G-CCGGAC +M16174.1/3-119 --UACGGC-GG-C-C-AU-AGC----GG-C-G-G-G---GA----------A-AC-A----------C--CCG----G-UC----CCAUGC------CGA-ACCCG-GA-AGUUAA-G-----------------CCUGCCA-G-C--G---CC--GAUG----GUACUGCAA---CCGAGAG-G-CUGUG-GGA---G---AGUA----GG-A-CG-CC--G-CCGGAC +X55267.1/3-119 --UACGGC-GG-C-C-AU-AGC----GG-A-G-G-G---GA----------A-AC-G----------C--CCG----G-UC----CCAUUC------CGA-ACCCG-GA-AGCUAA-G-----------------CCCUCCA-G-C--G---CC--GAUG----GUACUGCAC---CCGAGAG-A-GUGUG-GGA---G---AGUA----GG-A-CG-CC--G-CCGGAC +M16172.1/3-119 --UACGGC-GG-U-C-AU-AGC----GA-A-G-G-G---GA----------A-AC-G----------C--CCG----G-UC----CCAUUC------CGA-ACCCG-GA-AGCUAA-G-----------------CCCUUCA-G-C--G---CC--GAUG----GUACUGCAC---UCGCCAG-G-GUGUG-GGA---G---AGUA----GG-A-CG-CC--G-CCGGAC +#=GC SS_cons ..((((((.((.(.,.,,.,<<....-<.<.<.<.<...--..........-.<<.-..........-..<<<....<.<<....______.......>>.-->>>.>-.->>---.-.................>>>>>--.>.>..<...<<..-<<---..-<-<<----..-<<____>.>.-----.>>-...>..-->>-....>>.>..).))..).))))): +#=GC RF ..gccuGc.gg.c.C.AU.Acc....ag.c.g.c.g...aA..........a.gc.A..........C..cgG....a.uC....CCAUCc.......Ga.ACuCc.gA.AguUAA.G.................cgcgcUu.g.g..g...Cc..agggUA..GUAcuagGa..UGgGuGAc.C.uCcUG.ggA...A..gacca....gG.u..g.cc..g.Caggcc +// +# STOCKHOLM 1.0 + +#=GF AC RF00002 +#=GF ID 5_8S_rRNA +#=GF DE 5.8S ribosomal RNA +#=GF AU Griffiths-Jones SR; 0000-0001-6043-807X +#=GF AU Mifsud W; 0000-0002-9805-6461 +#=GF SE Wuyts et al, European LSU rRNA database, PMID:11125083 +#=GF SS Published; PMID:11125083 +#=GF GA 42.00 +#=GF TC 42.00 +#=GF NC 41.90 +#=GF TP Gene; rRNA; +#=GF BM cmbuild -F CM SEED +#=GF CB cmcalibrate --mpi CM +#=GF SM cmsearch --cpu 4 --verbose --nohmmonly -T 19.62 -Z 742849.287494 CM SEQDB +#=GF CL CL00112 +#=GF DR URL; http://rrna.uia.ac.be/lsu/index.html; +#=GF DR SO; 0000375; rRNA_5_8S; +#=GF DR GO; 0003735; structural constituent of ribosome; +#=GF DR GO; 0005840; ribosome; +#=GF RN [1] +#=GF RM 11125083 +#=GF RT The European Large Subunit Ribosomal RNA Database. +#=GF RA Wuyts J, De Rijk P, Van de Peer Y, Winkelmans T, De Wachter R +#=GF RL Nucleic Acids Res 2001;29:175-177. +#=GF RN [2] +#=GF RM 9108162 +#=GF RT Role of the 5.8S rRNA in ribosome translocation. +#=GF RA Abou Elela S, Nazar RN +#=GF RL Nucleic Acids Res 1997;25:1788-1794. +#=GF RN [3] +#=GF RM 9154813 +#=GF RT Cytoplasmic p53 polypeptide is associated with ribosomes. +#=GF RA Fontoura BM, Atienza CA, Sorokina EA, Morimoto T, Carroll RB +#=GF RL Mol Cell Biol 1997;17:3146-3154. +#=GF CC 5.8S ribosomal RNA (5.8S rRNA) is a component of the large subunit of the +#=GF CC eukaryotic ribosome. 
It is transcribed by RNA polymerase I as part of the +#=GF CC 45S precursor that also contains 18S and 28S rRNA. Functionally, it is +#=GF CC thought that 5.8S rRNA may be involved in ribosome translocation [2]. It +#=GF CC is also known to form covalent linkage to the p53 tumour suppressor +#=GF CC protein [3]. 5.8S rRNA is also found in archaea. +#=GF WK 5.8S_ribosomal_RNA +#=GF SQ 61 + + +L78065.1/3758-3910 AACCCUAGGCAGGGGAUCACUCGGC-UCAUGGAUCGAUGAAGACCGCAGC-UAAA-UG-CGCGUCAGAAUGU-----GAACUG--CAG-GAC---------ACAU-GAACACCGACA--------CGUUGAACG-AUAUUGC-GCAUUGCAC--------GACUC------AGUGCG--AUGUACACA---UUU---UUGAGUGCCC +AB011808.1/289-442 AACUUUCAACAACGGAUCUCUUGGU-UCUCGCAUCGAUGAAGAACGCAGC-GAAA-UG-CGAUACGUAAUGU-----GAAUUG--CAG-AAUU--------CCGUGAAUCAUCGAAU--------CUUUGAACGCACAUUGC-GCCCCUUGG--------UAUU--------CCAGG--GGGCAUGCC---UGU---UUGAGCGUCA +X60705.1/112-265 AACUUUUAACAACGGAUCUCUUGGC-UCUAGCAUCGAUGAAGAACGCAGC-GAAA-CG-CGAUAUGUAGUGU-----GAAUUG--CAG-AAUU--------CAGUGAAUCAUCGAAU--------CUUUGAACGCACAUGGC-GCCUUCCAG--------UAUC--------CUGGG--AGGCAUGCC---UGU---CCGAGCGUCG +M36008.1/959-1112 GACUCUUAGCGGUGGAUCACUCGGC-UCGUGCGUCGAUGAAGAACGCAGC-UAGC-UG-CGAGAAUUAGUGU-----GAAUUG--CAG-GAC---------ACAUUGAUCAUCGACA--------CUUCGAACGCACCUUGC-GGCCCCGGG--------UUCC-------UCCCGG--GGCUACGCC---UGU---CUGAGGGUCG +#=GC SS_cons :::::::::::::::::::::::::.::::::::::::::((((<<<<__.____.>>.>>,,,,,,,<<<-.....---<<_..___.>>--........-------->>>,,,,,........,)))),,,<<<___>>>.<<<<<<<<<........____........>>>>>..>>>>:::::...:::...:::::::::: +#=GC RF AACUuUuAgCgAUGGAUguCUuGGc.UCucGcaUCGAUGAAgaaCGCaGC.gAAa.uG.CGAUAcgUaauGU.....GAAuuG..CAG.aauu........ccgUGAAUCauCGAau........cuuuGAACGCaaaUuGC.gcccccggg........Uuuu........cccgg..gggcAuguc...UGu...uUgAGuGUCu +// +# STOCKHOLM 1.0 + +#=GF AC RF00003 +#=GF ID U1 +#=GF DE U1 spliceosomal RNA +#=GF AU Griffiths-Jones SR; 0000-0001-6043-807X +#=GF AU Mifsud W; 0000-0002-9805-6461 +#=GF AU Moxon SJ; 0000-0003-4644-1816 +#=GF AU Ontiveros-Palacios N; 0000-0001-8457-4455 +#=GF SE Zwieb C, The uRNA database, PMID:9016512 +#=GF SS Published; PMID:2405391 +#=GF GA 66.00 +#=GF TC 66.00 +#=GF NC 65.90 +#=GF TP Gene; snRNA; splicing; +#=GF BM cmbuild -F CM SEED +#=GF CB cmcalibrate --mpi CM +#=GF SM cmsearch --cpu 4 --verbose --nohmmonly -T 30.00 -Z 742849.287494 CM SEQDB +#=GF CL CL00005 +#=GF DR SO; 0000391; U1_snRNA; +#=GF DR GO; 0000395; mRNA 5'-splice site recognition; +#=GF DR GO; 0030627; pre-mRNA 5'-splice site binding; +#=GF DR GO; 0005685; U1 snRNP; +#=GF RN [1] +#=GF RM 9016512 +#=GF RT The uRNA database. +#=GF RA Zwieb C +#=GF RL Nucleic Acids Res 1997;25:102-103. +#=GF RN [2] +#=GF RM 2405391 +#=GF RT Saccharomyces cerevisiae U1 small nuclear RNA secondary structure contains +#=GF RT both universal and yeast-specific domains. +#=GF RA Kretzner L, Krol A, Rosbash M +#=GF RL Proc Natl Acad Sci U S A 1990;87:851-855. +#=GF RN [3] +#=GF RM 7984237 +#=GF RT Crystal structure at 1.92 A resolution of the RNA-binding domain of the +#=GF RT U1A spliceosomal protein complexed with an RNA hairpin. +#=GF RA Oubridge C, Ito N, Evans PR, Teo CH, Nagai K +#=GF RL Nature 1994;372:432-438. +#=GF RN [4] +#=GF RM 11297556 +#=GF RT Two functionally distinct steps mediate high affinity binding of U1A +#=GF RT protein to U1 hairpin II RNA. +#=GF RA Katsamba PS, Myszka DG, Laird-Offringa IA +#=GF RL J Biol Chem 2001;276:21476-21481. +#=GF RN [5] +#=GF RM 19325628 +#=GF RT Crystal structure of human spliceosomal U1 snRNP at 5.5 A resolution. 
+#=GF RA Pomeranz Krummel DA, Oubridge C, Leung AK, Li J, Nagai K +#=GF RL Nature. 2009;458:475-480. +#=GF RN [6] +#=GF RM 30975767 +#=GF RT Mechanism of 5' splice site transfer for human spliceosome activation. +#=GF RA Charenton C, Wilkinson ME, Nagai K +#=GF RL Science. 2019;364:362-367. +#=GF CC U1 is a small nuclear RNA (snRNA) component of the spliceosome (involved +#=GF CC in pre-mRNA splicing). Its 5' end forms complementary base pairs with the +#=GF CC 5' splice junction, thus defining the 5' donor site of an intron. There +#=GF CC are significant differences in sequence and secondary structure between +#=GF CC metazoan and yeast U1 snRNAs, the latter being much longer (568 +#=GF CC nucleotides as compared to 164 nucleotides in human). Nevertheless, +#=GF CC secondary structure predictions suggest that all U1 snRNAs share a 'common +#=GF CC core' consisting of helices I, II, the proximal region of III, and IV [1]. +#=GF CC This family does not contain the larger yeast sequences. The structure of +#=GF CC U1 spliceosomal RNA has been reported in [5,6]. It present 4 Stem loops +#=GF CC (SL1, SL2, SL3, and SL4) and a region call Helix H. SL1, SL2, and SL3 are +#=GF CC join for the Helix H, forming a four-helix junction that are separated of +#=GF CC SL4. U1 snRNA is important in the precatalytic spliceosome, where the 5' +#=GF CC splice site (5'SS) of the pre-mRNA is recognized by pairing with 5'-U1 +#=GF CC snRNA. Where spliceosome activation is initiated by a disruption of the +#=GF CC 5′SS–U1 snRNP interaction by the DEAD-box helicase Prp28 [6]. The +#=GF CC structure of U1 small nucleolar RNA was reported in PDB:6QX9 +#=GF WK U1_spliceosomal_RNA +#=GF SQ 100 + + +X06810.1/261-421 AUACUUACCUGGACGG-GGUCA-AUGG---AUGAUCAA-UAAG-GUCCA-UGGCCU---AGG-GAAGUAACCUCCAUU-----GCACUU-AGGAGG-GGUGCUUU---------CCUA-AGGUCUGCCCAA---GUGG--CAG-AGCCU-ACGUCAUAAUUUGUGGUAG--UGGGGG--CUUGCGUU--CGCGCAGCCCCUUC +X14417.1/177-340 AUACUUACCUGGACGG-GGUCA-AUGG---GUAAUCAA-GAAG-UUCCA-UGGCCU---AGG-UUGGUGACCUCCAUU-----GCACUA-AGGAGG-GGUGCUUG---------CCUA-AGGUCGACCCAA---GUGG--UUG-AGCCU-ACGUCAUAAUUUGUUGUUGCAGAGGGG-GCCUGUGUU--CGCGCAGCCCCUAC +X06809.1/232-392 AUACUUACCUGGACGG-GGUCA-AUGG---AUGAUCAA-GAAG-GUCCA-UGGCCU---AGG-GAAGUAACCUCCAUU-----GCACUG-AGGAGG-GGUGCCUU---------UCUA-AGGUCUGUCCAA---GUGA--CAG-AGCCU-ACGUCAUAAUUUGUGGUAG--UGGGGG--CCUGCGUU--CGCGCGGCCCCUUU +URS000032B6B6_9606/1-164 AUACUUACCUGGCAGG-GGAGA-UACC---AUGAUCAC-GAAG-GUGGU-UUUCCCA--GGG-CGAGGCUUAUCCAUU-----GCACUC-CGG-AU-GUGCUGAC---------CCCUGCGAUUUCCCCAAA-UGUGG--GAA-ACUCG-ACUGCAUAAUUUGUGGUAG--UGGGGG--ACUGCGUU--CGCGCUUUCCCCUG +#=GR URS000032B6B6_9606/1-164 6QX9_1_SS ...........((((..(((((...((...((............))))..))))).(..(((..(.(((.(((((................)).)).).)))..).........)))).(((((((((.(.....)))..))).).)))..)))).................(((((..(..(((......)))..)))))). 
+#=GC SS_cons :::::::::::((((<.<<<<<.-<<<...<<______.____.>>>>>.>>>>>><..<<<.-<-<<<-<<<<<___.....______._>>->>.>->>>-->.........>>>>,<<<<<<-<<____..__>>..->>->->>>.,))))--------------..-<<<<<.-<-<<<<__..__>>>>->>>>>>: +#=GC RF AUACUUACCUGGccgg.GGgca.accg...gcGAUCAa.GAAG.gccgg.ugcCCca..ggg.ugaggcuccccCAUU.....GCACUu.cGgagg.gugccgac.........cccuGcGguucccCCaAA..GUGg..ugaaaccCg.AcggCAUAAUUUgUGgUAG..ucgGGG.gaccGcgUU..cGcgCguuCCCcgc +#=GC RNA_structural_elements ==========[5pH][================Stem=loop=1============][=========================Stem=loop==========================]=[=======Stem=loop=3==========]==[3pH]================[==========Stem=loop=4========] +// +# STOCKHOLM 1.0 + +#=GF ID U2 +#=GF DE U2 spliceosomal RNA +#=GF AU Griffiths-Jones SR; 0000-0001-6043-807X +#=GF AU Mifsud W; 0000-0002-9805-6461 +#=GF AU Gardner PP; 0000-0002-7808-1213 +#=GF SE The uRNA database, PMID:9016512; Griffiths-Jones SR; PMID:18390578 +#=GF SS Published; PMID:2339054; Griffiths-Jones SR +#=GF GA 46.00 +#=GF TC 46.00 +#=GF NC 45.90 +#=GF TP Gene; snRNA; splicing; +#=GF BM cmbuild -F CM SEED +#=GF CB cmcalibrate --mpi CM +#=GF SM cmsearch --cpu 4 --verbose --nohmmonly -E 1000 -Z 549862.597050 CM SEQDB +#=GF CL CL00006 +#=GF DR SO; 0000392; U2_snRNA; +#=GF DR GO; 0000348; mRNA branch site recognition; +#=GF DR GO; 0045131; pre-mRNA branch point binding; +#=GF DR GO; 0005686; U2 snRNP; +#=GF RN [1] +#=GF RM 9016512 +#=GF RT The uRNA database. +#=GF RA Zwieb C +#=GF RL Nucleic Acids Res 1997;25:102-103. +#=GF RN [2] +#=GF RM 11424937 +#=GF RT A conserved pseudouridine modification in eukaryotic U2 snRNA induces a +#=GF RT change in branch-site architecture. +#=GF RA Newby MI, Greenbaum NL +#=GF RL RNA 2001;7:833-845. +#=GF RN [3] +#=GF RM 11350032 +#=GF RT Crystal structure of a model branchpoint-U2 snRNA duplex containing bulged +#=GF RT adenosines. +#=GF RA Berglund JA, Rosbash M, Schultz SC +#=GF RL RNA 2001;7:682-691. +#=GF RN [4] +#=GF RM 2339054 +#=GF RT The spliceosomal snRNAs of Caenorhabditis elegans. +#=GF RA Thomas J, Lea K, Zucker-Aprison E, Blumenthal T +#=GF RL Nucleic Acids Res 1990;18:2633-2642. +#=GF CC U2 is a small nuclear RNA (snRNA) component of the spliceosome (involved +#=GF CC in pre-mRNA splicing). Complementary binding between U2 snRNA (in an area +#=GF CC lying towards the 5' end but 3' to hairpin I) and the branchpoint sequence +#=GF CC (BPS) of the intron results the bulging out of an unpaired adenosine, on +#=GF CC the BPS, which initiates a nucleophilic attack at the intronic 5' splice +#=GF CC site, thus starting the first of two transesterification reactions that +#=GF CC mediate splicing. 
+#=GF WK U2_spliceosomal_RNA +#=GF SQ 208 + + +AALT01209640.1/567-377 AUCGCU-UCU----CGGCC--UUU-U-GGCUAAGAUCAU--GUGUAGUAUCUGUUCUUAUCAGUUUAAUAUCUGAUA--CGUCCUCU-AUCAGAGGACAA------------UAUAUUAAA------UGGAUUUUUGGAAUUA------GGAG-UUGGAAUA-------GGAGC---U-----U----G-CUCCG-------UCCA-CUCCAC-GCAUCGA---CCUGGUAUUGCAGUAC------UUCCAGG--AA--------CGGUGCACCCCCU +AAFR03033875.1/20528-20718 AUCGCU-UCU----CGGCC--UUU-U-GGCUAAGAUCAA--GUGUAGUAUCUGUUCUUAUCAGUUUAAUAUCUGAUA--CGUCCUCU-AUCCGAGGACAA------------UAUAUUAAA------UGGAUUUUUGAAACAG------GGAG-UCGGAAUA-------GGAGC---U-----U----G-CUCCG-------UCCA-CUCCAC-GCAUCGA---CCUGGUAUUGCAGUAC------UUCCAGG--AA--------CGGUGCACUUCCC +AAIY01044029.1/787-597 AUCGCU-UCU----CGGCC--UUU-U-GGCUAAGAUCAA--GUGUAGUAUCUGUUCUUAUCAGUUUAAGAUCUGAUA--UGUCCUCU-AUCUGAGGACAA------------CAUAUUAAA------CGUAUUUUUGGAAAUA------GGAG-UUGGACCA-------GGAGC---U-----U----G-CUCCA-------UCCA-CUCCAC-GCAUCAG---CCUGGUAUUGCAGUAU------UUCCAGG--AA--------UGGUGCAGCCCCU +AAZO01007389.1/15370-15178 AUCGCU-UCU----CGGCC--UUA-U-GGCUAAGAUCAAA-GUGUAGUAUCUGUUCUUAUCAGCUUAAUAUCUGAUA--CGACCCUC-AUUGAGGGUCCAG-----------AAUAUUAAA------CUGAUUUUUGGAAACG------GAUG-GAGUGUUA-------GGGGC---U-----U----G-CUCCA-------CCUC-CGUCAC-GGGUUGG---CUCGGCAUUGCAGUAC------AGCCGAG--AU--------CGGCCCACCCUUA +AAYZ01695118.1/310-500 AUCGCU-UCU----CGGCC--UUU-U-GGCUAAGAUCAA--GUGUAGUAUCUGUUCUUAUCAGUUUAAUAUCUGAUA--CGUCCUCU-AUCCGAGGACAA------------UAUAUUAAA------UGGAUUUUUGGACGCU------GGAG-UUGGACUA-------GGAGC---U-----U----G-CUCCA-------UCCA-CUCCGC-GCAUCGA---CCUGGUAUUGCAGUAC------UUCCAGG--AC--------CGGUGCACCCCGU +AAHX01044404.1/26102-26292 AUCGCU-UCU----CGGCC--UUU-U-GGCUAAGAUCAA--GUGUAGUAUCUGUUAUUAUCAGUUUAAUAUCUGAUA--UGUCCUCU-AUCUGAGGACAA------------UAUAUUAAA------UGAAUUUUUGGUACUA------GGAG-UUGGAAUA-------GGAGC---U-----U----G-CUCCA-------UCCA-CUCCAC-GCAUCGA---CCUGGUAUUGCAGUAU------UUCCAGG--AA--------UGGUGCACUCCUC +AACN010750078.1/657-848 AUCGCU-UCU----CGGCC--UUU-U-GGCUAAGAUCAA--GUGUAGUAUCUGUUCUUAUCAGUUUAAUAUCUGAUA--CGUCCUCU-AUCCGAGGACAA------------UAUAUUAAA------UGGAUUUUUGGAGCAG------GGAGAUGGAAUAG-------GGAGC---U-----U----G-CUCCG-------UCCA-CUCCGC-GCAUCGA---CCUGGUAUUGCAGUAC------CUCCAGG--AA--------CGGUGCACCCCCU +AAFN02000024.1/475809-475596 AUC----UCU----UUGCC--AUU-U-GGCUUAGAUCCA--GUGUAGUAUCUGUUCUUUUCAGUGUAACAGCUGAAA---UGUCAUC-AUUGAUGACUUUACAUUAUGUUACAAAUUUAUA------CUUAUUUUUGGAUAUUGG----GUAG-AUUGAUGUAUUAAAGUGGGC---U-----U----G-CUCAC----AGUCUUU-CUACAU-AGUGUCG---UUGCCA-CUGUACUUUUAUU--UUGGCUU--CU--------GACGCAAAUUCUU +K00034.1/420-610 AUCGCU-UCU----CGGCC--UUU-U-GGCUAAGAUCAA--GUGUAGUAUCUGUUCUUAUCAGUUUAAUAUCUGAUA--CGUCCUCU-AUCCGAGGACAA------------UAUAUUAAA------UGGAUUUUUGGAACUA------GGAG-UUGGAAUA-------GGAGC---U-----U----G-CUCCG-------UCCA-CUCCAC-GCAUCGA---CCUGGUAUUGCAGUAC------CUCCAGG--AA--------CGGUGCACCCCCU +ABDG02000029.1/618164-617972 UUAGCUCUCU----UUGCC--UUU-U-GGCUUAGAUCAA--GUGUAGUAUCUGUUCUUUUCAGUUUAAUCUCUGAAA---GGUCUCU-AA-GGAGACCAAUC----------GUGAUUAUU------CUUAUUUUUGUCCUCA------GGGC-GGUCUCCUC------UGUGC---U-----U----G-CACAU-----GAUUCC-GCCCAC-AGUGUC----CCUGGUAUUACACUGC------CUCCAGG---C--------GACGCGAACACUU +AP004871.3/124344-124540 AUACCUUUCU----CGGCC--UUU-U-GGCUAAGAUCAA--GUGUAGUAUCUGUUCUUAUCAGUUUAAUAUCUGAUA--UGUGGGCC-AUGUGCCCACUUC-----------GAUAUUAAA------UUUAUUUUUUGUGGGG------GAGG-GCCCACUACA-----GUGGC---U-----U----G-CCACU------GGGGU-CCUCGC-GUGUCGC---CCAGGCGUUGCACUAC------AGCCUGG-GCC--------UGGCGCACCCCAA +#=GC SS_cons 
::::::.<<<....-<<<<..___._.>>>>->>>,,,,..,,,,,,,,,,,,,,,,<<<<<<________>>>>>>..,<<<<<<<.___>>>>>>>,,,...........,,,,,,,,,......,,,,,,,,,,,,,,,,......<<<<.<<<<----.......<<<<<..._....._....>.>>>>-......->>>>.>>>>,,.<<<<<<-...<<<<<<__________......_>>>>>>..--........>>>>>>::::::: +#=GC RF AUacCU.UCu....cgGCc..UUU.U.gGCuaaGAUCAA..GUGUAGUAUCUGUUCUUauCAGUuUAAuAuCUGauA..uggccccc.Auugggggccaau...........uauaUUAaa......uuaAUUUUUggaacua......Gugg.gggcauuu.......uggGC...U.....U....G.Cccau......ugccc.ccaCac.ggguuga...ccuggcaUUGCAcUac......cgccagg..uu........cagcccAcccuuu +// +# STOCKHOLM 1.0 + +#=GF AC RF00005 +#=GF ID tRNA +#=GF DE tRNA +#=GF AU Eddy SR; 0000-0001-6676-4706 +#=GF AU Griffiths-Jones SR; 0000-0001-6043-807X +#=GF AU Mifsud W; 0000-0002-9805-6461 +#=GF SE Eddy SR +#=GF SS Published; PMID:8256282 +#=GF GA 29.00 +#=GF TC 29.00 +#=GF NC 28.90 +#=GF TP Gene; tRNA; +#=GF BM cmbuild -F CM SEED +#=GF CB cmcalibrate --mpi CM +#=GF SM cmsearch --cpu 4 --verbose --nohmmonly -T 22.00 -Z 549862.597050 CM SEQDB +#=GF CL CL00001 +#=GF DR SO; 0000253; tRNA; +#=GF DR GO; 0030533; triplet codon-amino acid adaptor activity; +#=GF RN [1] +#=GF RM 8256282 +#=GF RT The tertiary structure of tRNA and the development of the genetic code. +#=GF RA Hou YM +#=GF RL Trends Biochem Sci 1993;18:362-364. +#=GF RN [2] +#=GF RM 9023104 +#=GF RT tRNAscan-SE: a program for improved detection of transfer RNA genes in +#=GF RT genomic sequence. +#=GF RA Lowe TM, Eddy SR +#=GF RL Nucleic Acids Res 1997;25:955-964. +#=GF CC Transfer RNA (tRNA) molecules are approximately 80 nucleotides in length. +#=GF CC Their secondary structure includes four short double-helical elements and +#=GF CC three loops (D, anti-codon, and T loops). Further hydrogen bonds mediate +#=GF CC the characteristic L-shaped molecular structure. tRNAs have two regions of +#=GF CC fundamental functional importance: the anti-codon, which is responsible +#=GF CC for specific mRNA codon recognition, and the 3' end, to which the tRNAs +#=GF CC corresponding amino acid is attached (by aminoacyl-tRNA synthetases). +#=GF CC tRNAs cope with the degeneracy of the genetic code in two manners: having +#=GF CC more than one tRNA (with a specific anti-codon) for a particular amino +#=GF CC acid; and 'wobble' base-pairing, i.e. permitting non-standard base-pairing +#=GF CC at the 3rd anti-codon position. 
+#=GF WK Transfer_RNA +#=GF SQ 954 + + +AB003409.1/96-167 GGGCCCAU-A-GCUCAGU---GGU---AGAGUG-C-CUCCU-UUGCAAGGAG-GAU------------------------GC--CCUG-GGU-UCG-AA--UCCCA-G-UGGGUCC-A +CP000660.1/704452-704523 GGGCCGGU-A-GUCUAGC---GGA---AGGAUG-C-CCGCC-UCGCGCGCGG-GAG------------------------AU--CCCG-GGU-UCG-AA--UCCCG-G-CCGGUCC-A +X63776.1/648-721 CGGCACGU-A-GCGCAGCC-UGGU---AGCGCA-C-CGUCA-UGGGGUGUCG-GGG------------------------GU--CGGA-GGU-UCA-AA--UCCUC-U-CGUGCCG-A +DQ927305.1/46859-46925 GCUGCUUG-A-AUGGU-----------UUCAGU-G-UGGGC-UCAUUUCCCA-UUA------------------------CU--CAAA-AGU-UCG-AU--UCUUU-U-AAGCGGC-C +K01561.1/1-74 GCGUUCAU-A-GCUCAGUU--GGUU--AGAGCA-C-CACCU-UGACAUGGUG-GGG------------------------GU--CGUU-GGU-UCG-AG--UCCAA-U-UGAACGC-A +X17321.1/66-138 GGGUGAUU-A-GCUCAGCU--GGG---AGAGCA-C-CUCCC-UUACAAGGAG-GGG------------------------GU--CGGC-GGU-UCG-AU--CCCGU-C-AUCACCC-A +AY632242.1/10-80 CAUUAGAU-G-ACUGAA----AG----CAAGUA-C-UGGUC-UCUUAAACCA-UUU------------------------UA--UAGU-AAA-UUA-GC-AUUUAC-U-UCUAAUG-A +J01404.1/5140-5204 AUCUAUAU-A-GUAUAAA---------AGUAUA-U-UUGAC-UUCCAAUCAU-AAG------------------------G---UCUA-UU--AAU-U----AAUA-G-UAUAGAU-A +EU273712.1/5242-5176 AGCCUUAA-A-GUGUUU----------AUCAUG-U-CGAAU-UGCAAAUUCG-AAG------------------------G---UGUA-GAG-AAU-C-C-CUCUA-C-UAAGGCU-U +EU255777.1/1590-1519 UGGGGCGU-G-GCCAAGU---GGU---AAGGCA-A-CGGGU-UUUGGUCCCG-CUA------------------------UU--CGGA-GGU-UCG-AA--UCCUU-C-CGUCCCA-G +X13994.1/40-129 GAAGAUCG-U-CGUCUCC---GGUG--AGGCGG-C-UGGAC-UUCAAAUCCA-GU--UGG-GGCCGCCA--GCGGUCCCG----GGCA-GGU-UCG-AC--UCCUG-U-GAUCUUCC- +M21681.1/156-228 CGCGGGGU-G-GAGCAGCC-UGGU---AGCUCG-U-CGG-C-UCAUAACCCG-AAG------------------------GU--CGUC-GGU-UCA-AA--UCCGG-C-CCCCGCA-A +K00197.1/1-71 GCGGGCGU-A-GUUCAAU---GGU---AGAACG-A-GAGCU-UCCCAAGCUC-UAU------------------------A---CGAG-GGU-UCG-AU--UCCCU-U-CGCCCGC-U +X51770.1/245-317 GGCCGCGU-G-GCGCAAU---GGAU--AACGCG-U-CUGCC-UACGGAGCAG-AAG------------------------AU--UGCA-GGU-UCG-AA--UCCUG-C-CGUGGUC-G +K01856.1/1-82 GGAGAGAU-G-GCCGAGC---GGUCU-AAGGCG-C-UGGUU-UAAGGCACCA-GU--CCC-----UUC---G-----GGGG---CGUG-GGU-UCG-AA--UCCCA-C-UCUCUUC-A +Z83129.1/22044-21973 UCCUCGGU-A-GUAUAGU---GGUG--AGUAUC-C-GCGUC-UGUCACAUGC-GAG------------------------A---CCCG-GGU-UCA-AU--UCCCG-G-CCGGGGA-G +M25476.1/51-122 AGCAGCGU-G-GCGCAGU---GGA---AGCGUG-C-UGGGC-CCAUAACCCA-GAG------------------------GU--CGGU-GGA-UCG-AA--ACCAC-U-CGCUGCU-A +X54124.1/910-981 GACUGCUU-G-GCGCAAU---GGU---AGCGCG-U-UCGAC-UCCAGAUCGA-AAG------------------------GU--UGGG-CGU-UCG-AU--CCGCU-C-AGUGGUC-A +K01390.1/442-514 GAGCCAUU-A-GCUCAGUU--GGU---AGAGCA-U-CUGAC-UUUUAAUCAG-AGG------------------------GU--CGAA-GGU-UCG-AG--UCCUU-C-AUGGCUC-A +#=GC SS_cons (((((((,.,.<<<<___...___..._>>>>,.<.<<<<_.______>>>>.>,,........................,...,<<<.<<_.___.__.._>>>>.>.))))))).: +#=GC RF GgagauaU.A.GCucAgU...GGU...AgaGCg.u.cgGaC.UuaaAAuCcg.aag........................g...cgcg.GGU.UCg.Aa..UCCcg.c.uaucucC.a +// +# STOCKHOLM 1.0 + +#=GF AC RF00006 +#=GF ID Vault +#=GF DE Vault RNA +#=GF AU Bateman A; 0000-0002-6982-4660 +#=GF AU Gardner PP; 0000-0002-7808-1213 +#=GF SE Published; PMID:19491402 +#=GF SS Published; PMID:19491402 +#=GF GA 34.00 +#=GF TC 34.10 +#=GF NC 33.90 +#=GF TP Gene; +#=GF BM cmbuild -F CM SEED +#=GF CB cmcalibrate --mpi CM +#=GF SM cmsearch --cpu 4 --verbose --nohmmonly -E 1000 -Z 549862.597050 CM SEQDB +#=GF DR URL; http://vaults.arc.ucla.edu/sci/sci_home.htm; +#=GF DR SO; 0000404; vault_RNA; +#=GF RN [1] +#=GF RM 19491402 +#=GF RT Evolution of Vault RNAs. 
+#=GF RA Stadler PF, Chen JJ, Hackermueller J, Hoffmann S, Horn F, Khaitovich P, +#=GF RA Kretzschmar AK, Mosig A, Prohaska SJ, Qi X, Schutt K, Ullmann K +#=GF RL Mol Biol Evol. 2009;27:1-17. +#=GF CC This family of RNAs are found as part of the enigmatic vault +#=GF CC ribonucleoprotein complex. The complex consists of a major vault protein +#=GF CC (MVP), two minor vault proteins (VPARP and TEP1), and several small +#=GF CC untranslated RNA molecules. It has been suggested that the vault complex +#=GF CC is involved in drug resistance. We have identified a putative novel vault +#=GF CC RNA on chromosome 5 EMBL:AC005219. +#=GF WK Vault_RNA +#=GF SQ 73 + + +AF045145.1/1-88 -GGCUGGC-UUUAGCUC-AGCGGUUACUUCGCGUGUCAUCAAACCACCUCUCU-------------------------------------------------------GGGUUGUUCGAGAC-----------------CCGCGGGCGCUCUCCAGCCCUCUU +AADA01028285.1/4499-4587 GGGCUGGC-UUUAGCUC-AGCGGUUACUUCGCGUGUCAUCAAACCACCUCUCU-------------------------------------------------------GGGUGGUUCGAGAC-----------------CCGUGGGCGCUCUCCAUCUCUCUU +AANU01185798.1/9761-9673 GGGCUGGC-UUUAGCUC-AGCGGUUACUUCGCAGUUCAGCAAACCACCUCUCU-------------------------------------------------------GGGUUGUUCGAGAC-----------------CCGCGGGCACUCUCCAGCCCUUUU +AAGJ04060733.1/252-131 UGGCUGGC--UAAGCAG-UGUGGAUACUUCGUAUAGCUAAAUGGAAUAGCACUUACUAAUCACAUGAG------------------------------------UAUUUGUGGGUUCGAUCCCCACUAC--GUCUCUUUCUACAUGUGCGUUCCAGCUUUUUU +#=GC SS_cons :<<<<<<-.----<<<<.-<<<<----<<<<<________________________________...........................................________>>>>>--.................>>>>>>>>----->>>>>>::::: +#=GC RF GgGccGGC.UUUAGCuc.AGcGGUUACuUCgacuauuuuaauuuuauuuaucuuauuuuuuuuu...........................................uuguuGGUucGAgaC.................CCgCggGCGCUcUCCggCccUUUU +// diff --git a/src/python/test/xrefs/parsers/flatfiles/rgd.txt b/src/python/test/xrefs/parsers/flatfiles/rgd.txt new file mode 100644 index 000000000..f0da555bf --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/rgd.txt @@ -0,0 +1,98 @@ +# RGD-PIPELINE: ftp-file-extracts +# MODULE: genes build 2024-03-11 +# GENERATED-ON: 2024/05/17 +# PURPOSE: information about active Rat genes extracted from RGD database +# SPECIES: Rattus norvegicus (Norway rat) NCBI:txid10116 +# CONTACT: rgd.data@mcw.edu +# FORMAT: tab delimited text +# NOTES: multiple values in a single column are separated by ';' +# +### Apr 1, 2011 RATMAP_IDs and RHDB_IDs are discontinued. +### Apr 15, 2011 GENE_REFSEQ_STATUS column is provided. +### Jul 1, 2011 fixed generation of CURATED_REF_PUBMED_IDs and UNCURATED_PUBMED_IDs +### Nov 23, 2011 no format changes (UniGene Ids are extracted from db in different way) +### Dec 19, 2011 fixed documentation in header to be consistent with column names +### Jul 6, 2012 added generation of file GENES_RAT_5.0 +### Oct 23, 2012 obsoleted column 23 'UNCURATED_REF_MEDLINE_ID' - changed to '(UNUSED)' +### Aug 19, 2013 gene descriptions made consistent with gene report pages from RGD website +### Oct 2, 2014 genes files refactored: +### GENES_RAT_5.0.txt and GENES_RAT_6.0.txt retired -- added new columns to GENES_RAT.txt to accommodate positions for Rnor_5.0 and Rnor_6.0. 
+### May 25, 2017 GENE_REFSEQ_STATUS is now published in column 23 for all species +### during transition period, for rat, mouse and human, GENE_REFSEQ_STATUS will continue to be also published in columns 39, 41 and 42 respectively +### Nov 1, 2018 renamed columns: SSLP_RGD_ID => MARKER_RGD_ID, SSLP_SYMBOL => MARKER_SYMBOL +### Jun 17 2019 data sorted by RGD ID; files exported into species specific directories +### Mar 11 2020 added Ensembl map positions +### Jan 18 2021 discontinued column 27 UNIGENE ID +### Feb 12 2021 added export of positions on assembly mRatBN7.2; discontinued export of positions on assembly RGSCv3.1 (columns 6,12,13,14) +### Jan 25 2022 rat Ensembl positions exported for mRatBN7.2 assembly +### Apr 18 2022 added export of canonical proteins in column 27 +### Mar 10 2023 no more 'protein_coding' gene types: 'protein-coding' used instead +### Mar 11 2024 added export of positions on assembly GRCr8 +# +#COLUMN INFORMATION: +# (First 38 columns are in common between all species) +# +#1 GENE_RGD_ID the RGD_ID of the gene +#2 SYMBOL official gene symbol +#3 NAME gene name +#4 GENE_DESC gene description (if available) +#5 CHROMOSOME_CELERA chromosome for Celera assembly +#6 CHROMOSOME_mRatBN7.2 chromosome for reference assembly mRatBN7.2 +#7 CHROMOSOME_RGSC_v3.4 chromosome for reference assembly RGSC_v3.4 +#8 FISH_BAND fish band information +#9 START_POS_CELERA start position for Celera assembly +#10 STOP_POS_CELERA stop position for Celera assembly +#11 STRAND_CELERA strand information for Celera assembly +#12 START_POS_mRatBN7.2 start position for reference assembly mRatBN7.2 +#13 STOP_POS_mRatBN7.2 stop position for reference assembly mRatBN7.2 +#14 STRAND_mRatBN7.2 strand information for reference assembly mRatBN7.2 +#15 START_POS_RGSC_v3.4 start position for reference assembly RGSC_v3.4 +#16 STOP_POS_RGSC_v3.4 stop position for reference assembly RGSC_v3.4 +#17 STRAND_RGSC_v3.4 strand information for reference assembly RGSC_v3.4 +#18 CURATED_REF_RGD_ID RGD_ID of paper(s) used to curate gene +#19 CURATED_REF_PUBMED_ID PUBMED_ID of paper(s) used to curate gene +#20 UNCURATED_PUBMED_ID PUBMED ids of papers associated with the gene at NCBI but not used for curation +#21 NCBI_GENE_ID NCBI Gene ID +#22 UNIPROT_ID UniProtKB id(s) +#23 GENE_REFSEQ_STATUS gene RefSeq Status (from NCBI) +#24 GENBANK_NUCLEOTIDE GenBank Nucleotide ID(s) +#25 TIGR_ID TIGR ID(s) +#26 GENBANK_PROTEIN GenBank Protein ID(s) +#27 CANONICAL_PROTEIN UniProt canonical protein(s) +#28 MARKER_RGD_ID RGD_ID(s) of markers associated with given gene +#29 MARKER_SYMBOL marker symbol +#30 OLD_SYMBOL old symbol alias(es) +#31 OLD_NAME old name alias(es) +#32 QTL_RGD_ID RGD_ID(s) of QTLs associated with given gene +#33 QTL_SYMBOL QTL symbol +#34 NOMENCLATURE_STATUS nomenclature status +#35 SPLICE_RGD_ID RGD_IDs for gene splices +#36 SPLICE_SYMBOL symbol for gene +#37 GENE_TYPE gene type +#38 ENSEMBL_ID Ensembl Gene ID +#39 (UNUSED) blank +#40 CHROMOSOME_Rnor_5.0 chromosome for Rnor_5.0 reference assembly +#41 START_POS_Rnor_5.0 start position for Rnor_5.0 reference assembly +#42 STOP_POS_Rnor_5.0 stop position for Rnor_5.0 reference assembly +#43 STRAND_Rnor_5.0 strand information for Rnor_5.0 reference assembly +#44 CHROMOSOME_Rnor_6.0 chromosome for Rnor_6.0 reference assembly +#45 START_POS_Rnor_6.0 start position for Rnor_6.0 reference assembly +#46 STOP_POS_Rnor_6.0 stop position for Rnor_6.0 reference assembly +#47 STRAND_Rnor_6.0 strand information for Rnor_6.0 reference assembly +#48 CHROMOSOME_ENSEMBL 
chromosome for mRatBN7.2 Ensembl assembly +#49 START_POS_ENSEMBL start position for mRatBN7.2 Ensembl assembly +#50 STOP_POS_ENSEMBL stop position for mRatBN7.2 Ensembl assembly +#51 STRAND_ENSEMBL strand information for mRatBN7.2 Ensembl assembly +#52 CHROMOSOME_GRCr8 chromosome for GRCr8 NCBI assembly +#53 START_POS_GRCr8 start position for GRCr8 NCBI assembly +#54 STOP_POS_GRCr8 stop position for GRCr8 NCBI assembly +#55 STRAND_GRCr8 strand information for GRCr8 NCBI assembly +# +GENE_RGD_ID SYMBOL NAME GENE_DESC CHROMOSOME_CELERA CHROMOSOME_mRatBN7.2 CHROMOSOME_RGSC_v3.4 FISH_BAND START_POS_CELERA STOP_POS_CELERA STRAND_CELERA START_POS_mRatBN7.2 STOP_POS_mRatBN7.2 STRAND_mRatBN7.2 START_POS_RGSC_v3.4 STOP_POS_RGSC_v3.4 STRAND_RGSC_v3.4 CURATED_REF_RGD_ID CURATED_REF_PUBMED_ID UNCURATED_PUBMED_ID NCBI_GENE_ID UNIPROT_ID GENE_REFSEQ_STATUS GENBANK_NUCLEOTIDE TIGR_ID GENBANK_PROTEIN CANONICAL_PROTEIN MARKER_RGD_ID MARKER_SYMBOL OLD_SYMBOL OLD_NAME QTL_RGD_ID QTL_SYMBOL NOMENCLATURE_STATUS SPLICE_RGD_ID SPLICE_SYMBOL GENE_TYPE ENSEMBL_ID (UNUSED) CHROMOSOME_Rnor_5.0 START_POS_Rnor_5.0 STOP_POS_Rnor_5.0 STRAND_Rnor_5.0 CHROMOSOME_Rnor_6.0 START_POS_Rnor_6.0 STOP_POS_Rnor_6.0 STRAND_Rnor_6.0 CHROMOSOME_ENSEMBL START_POS_ENSEMBL STOP_POS_ENSEMBL STRAND_ENSEMBL CHROMOSOME_GRCr8 START_POS_GRCr8 STOP_POS_GRCr8 STRAND_GRCr8 +2003 Asip agouti signaling protein 3 3 3 q41 142203571 142291520 + 143473584 143561170 + 145445175 145536831 + 68690;70068;1625724;1598407;1580655;1600115;1580654;2313999;2314006;1357925;6480464;6484113;8554872;13792537 10426381;11353396;14633851;15189116;21873635;7987393 12177191;12601169;17247639;17873059;19534427;21949658;29219041;7665913;9454589;9548375 24152 A0A8I6A2G1;F1LQS7;Q99JA2 VALIDATED AB045587;NM_052979 TC209185 BAB21564;BAB21579;EDL85941;EDL85942;EDL85943;NP_443211;Q99JA2 Q99JA2 5081260;5501161 PMC151376P1;RH142058 A;ASP agouti;agouti (coat color);agouti switch protein;agouti-signaling protein 70199 Coreg1 APPROVED protein-coding ENSRNOG00000017701 3 156860395 156949277 + 3 150492010 150579870 + 3 143555696 143561171 + 3 163933768 164021377 + +2004 A2m alpha-2-macroglobulin 4 4 4 q42 143730195 143781190 + 154897770 154947787 + 158103711 158153423 + 70068;70249;67925;619610;704363;704364;1298539;1298570;1549857;1549856;1300048;1598506;1598509;1598510;1302534;1300321;1598710;1598511;1598512;1598513;1331525;1300322;1358261;1358260;1580654;1580655;1600115;2298922;1598407;2298948;6480464;6484113;6907045;7240710;7411612;7401223;8554872;10046031;10046042;10046045;10046010;10046012;10046014;10046021;10046023;10046029;10046030;10046033;10046034;10046036;10046046;10046016;10046018;10046028;10046044;10046015;10046032;10046041;13702087;6892692;13792537;6892693;38500238;1578409 10319853;10848441;10936700;11498265;11779202;11813239;11839752;11952820;12042906;12125811;12133586;12221929;12494268;12809600;12966032;14675603;14960360;15118671;15167684;15509519;16177542;16538883;1710603;17722867;18177927;19240864;20005173;20579363;21478484;21742475;21873635;22434847;2424486;2432068;2436819;2442306;2448189;2450021;2460123;2468362;2475424;2479532;2581948;28266892;32747830;6163339;6202298;9446838;9453001;94834;9697696;9843780 10880251;11435418;12223092;12477932;12538697;15226301;15272003;15489334;17071617;1725450;17487688;17565389;18485748;18701465;19796622;20458337;20848291;21188621;21362503;21642630;21669904;22516433;23376485;23533145;2414291;2466233;2473946;26746007;26895739;27301375;29476059;36894970;9398211;9714181 24153 A0A8L2QY59;A6ILD0;A6ILD1;A6ILD2;P06238;Q4FZY3 PROVISIONAL 
AH002120;AH002202;AH003208 TC229016;TC239648 AAA40636;AAA40637;AAA40638;AAA41595;AAA77658;AAH98922;AAW65786;AAX11376;AAX12488;CAA32164;EDM02007;EDM02008;EDM02009;NP_036620;P06238 P06238 10048;10049;42147 D4Arb15;D4Mit20;D4Wox16 A2MAC1;A2m1 alpha-2-M;alpha-2-macroglobulin-P;alpha-2-macroglobulin-like 6903353;724558 Bp353;Plsm2 PROVISIONAL protein-coding ENSRNOG00000028896;ENSRNOG00000045772 4 221393233 221442945 + 4 154309426 154359138 + 4 154897877 154947786 + 4 156570163 156619870 + + Aanat aralkylamine N-acetyltransferase 10 10 10 q32.2 100399613 100403925 + 101827072 101831805 + 106709371 106713683 + 70068;70285;67926;619610;632679;628397;1298610;1298540;1298611;1298603;704409;1300232;1580655;1600115;1300048;2302130;2301030;2301033;2301039;2301034;2301036;2301032;2301043;2312676;2301038;2301031;2301041;2301035;2301037;6480464;6907045;7240710;10402751;8553854;13792537 10451024;11125071;11427721;11516836;11854096;12358739;12736803;16024134;16166080;16282194;16441550;16805813;17014691;17164235;17198543;18001324;18048060;18321474;18624957;21873635;6268470;7502081;7592994;8524412;8770929;9054387 11313340;14617573;15046865;15193530;15228600;15798208;16099857;16687309;17363136;17364576;17403780;20210853;21437622;22908386;23080076;23513468;24877634;25594545;27339900;28502584;30890428;31124080;37256589;7545952 25120 A6HKX7;Q4JL74;Q64553;Q64666 VALIDATED AC123144;CH473948;DQ075321;JAXUCZ010000010;NM_052979 TC222688 AAA92711;AAB38484;AAC52330;AAY86767;EDM06682;NP_036950;Q64666;XP_006247854 Q64666 1626975;1630499;5028123 Aanat;D10Wox52;D11Mit102 AA-NAT;Nat4 Arylalkylamine N - acetyltransferase (Serotonin N - acetyltransferase);arylakylamine N-acetyltransferase;arylalkylamine N - acetyltransferase ;arylalkylamine N-acetyltransferase;seretonin N-acetyltransferase;serotonin N-acetyltransferase;serotonin acetylase APPROVED protein-coding 10 105231006 105235322 + 10 105568091 105572407 + 10 101827301 101831801 + 10 102323647 102330639 + +2007 Abcd3 ATP binding cassette subfamily D member 3 2 2 2 q42 202282471 202317962 - 209852087 209905763 - 218396071 218432172 - 70068;619610;631711;704362;1358265;1580655;1300330;1598654;1598656;1598657;1598658;704409;1600115;1580654;6480464;7240710;8554872;1580664;8553510;8554507;13792537 10366717;11125071;11341945;11883951;12176987;1301993;14561759;15060019;19010322;1968461;21873635;7528830;9108325 10527525;10704444;11248239;11453642;12865426;12915479;14651853;16344115;17542813;17609205;18178290;18614015;18992293;19479899;19686593;19946888;20007743;21460186;21502359;21525035;22871113;25168382;31505169;9425230;9765053;9922452 25270 A0A8I5ZN14;A0A8I6A495;A0A8I6ANP9;A6HVF3;A6HVF4;A6HVF5;P16970 VALIDATED XM_039101772;XM_039101774;XM_063281326 TC229511 BAA14086;EDL82089;EDL82090;EDL82091;NP_036936;P16970;XP_038957700;XP_038957702;XP_063137396 P16970 5025528;5051803;67314 D2Arb23;RH128671;RH94667 PMP70;PMP70, 70-kDa peroxisomal membrane protein 70 kDa peroxisomal membrane protein;70-kDa peroxisomal membrane protein;ATP-binding cassette sub-family D member 3;ATP-binding cassette, sub-family D (ALD), member 3;ATP-binding cassette, subfamily D (ALD), member 3;Peroxisomal membrane protein 1 APPROVED protein-coding 2 243374189 243409604 - 2 225335708 225389120 - 2 209852087 209906020 - 2 212536791 212590379 - +2011 ENSRNOG00000012966 acyl-CoA dehydrogenase, long chain 9 9 9 q32 65813263 65851320 - 68333981 68372149 - 65613130 65651775 - 
70068;619610;631718;631739;704362;737633;1600115;704409;1300048;1580654;1580655;2317589;2317678;6480464;6907045;8554872;10402751;8553446;13673745;13792537 11125071;12477932;14728676;15060019;21873635;2777793;3813556;3968063;8660691;9802886 14651853;15489334;15639194;18614015;21151927;23106098;26316108;26767982;8268228;9861014 25287 A0A8I6GMH0;A6KFD6;A6KFD7;P15650 PROVISIONAL BC062006;CH474044;FQ215575;FQ218275;J05029;JAXUCZ010000009;L11276;NM_012819;XM_063266668 TC203790 AAA40668;AAA41514;AAH62006;EDL75290;EDL75291;NP_036951;P15650;XP_063122738 P15650 5029849;5052821;5506717 ACADL;AW530440;RH142293 LCAD ACOADA;Acyl Coenzyme A dehydrogenase long chain;Acyl Coenzyme A dehydrogenase, long chain;LCAD long chain acyl-CoA dehydrogenase;LCAD, long chain acyl-CoA dehydrogenase;acetyl-Coenzyme A dehydrogenase, long-chain;acyl-Coenzyme A dehydrogenase, long-chain;long-chain acyl-CoA dehydrogenase;long-chain specific acyl-CoA dehydrogenase, mitochondrial APPROVED protein-coding ENSRNOG00000012966 9 73434371 73472895 + 9 73833368 73871857 - 9 68333980 68372220 - 9 75783689 75822077 - +2012 Acadm acyl-CoA dehydrogenase medium chain 2 2 2 q45 234791302 234815446 - 242858865 242883036 - 251866645 251890729 - 70068;619610;70860;631718;631724;631739;704362;1358266;704409;1600115;1598685;1598687;1598688;1598689;1598690;1598691;1300334;1300048;1580655;1580654;2317589;2317678;6480464;6484113;6907045;7240710;8554872;10402751;10047124;8553446;13792537 10958805;11125071;11306811;14728676;15060019;15358373;15850406;15852996;15863369;21873635;23076603;2777793;3611054;3813556;3968063;734877;8615829;8660691;9164869 14651853;16020546;16121256;16972171;18061544;18459129;18614015;1902818;19224950;19428797;19703432;1970566;2029527;21084676;21237683;21630459;23376485;2393404;25416781;26316108;26767982;32227582;3597357 24158 A0A8I5Y8D9;A0A8I5ZQ05;A6HWP6;G3V796;P08503 VALIDATED BP502473;CH473952;CK359511;FQ214755;J02791;JAXUCZ010000002;NM_016986 TC216640 AAA40670;EDL82532;NP_058682;P08503 P08503 5028777;5035562;5048878;5075084 ACADM;RH133158;RH138389;RH142291 Acyl-Coenzyme A dehydrogenase C-4 to C-12 straight-chain;Acyl-Coenzyme A dehydrogenase, C-4 to C-12 straight-chain;acetyl-Coenzyme A dehydrogenase, medium chain;acyl-CoA dehydrogenase, C-4 to C-12 straight chain;acyl-Coenzyme A dehydrogenase, C-4 to C-12 straight chain;acyl-Coenzyme A dehydrogenase, medium chain;medium-chain acyl-CoA dehydrogenase;medium-chain specific acyl-CoA dehydrogenase, mitochondrial APPROVED protein-coding ENSRNOG00000009845;ENSRNOG00055028387 2 278788485 278812656 - 2 260124418 260148589 - 2 242858865 242883147 - 2 245518693 245542864 - +2013 Acadsb acyl-CoA dehydrogenase, short/branched chain 1 1 1 q41 183943792 183982502 + 186188939 186227796 + 190987657 191026275 + 619610;631739;1298221;1358267;704409;1600115;1300336;1300048;1580654;1580655;6480464;6907045;7240710;8554872;10402751;13792537 11125071;12855692;21873635;631739;734879;8660691 10832746;11013134;14651853;18614015;23376485;23474214 25618 A0A0A0MY00;A0A8I6G5Q8;A0A8I6GLN2;A6HWW9;A6HWX0;P70584 PROVISIONAL AAB17136;EDM11700;EDM11701;NP_037216;P70584;XP_008758088;XP_063138171 P70584 5057173 D1Bda38 2-MEBCAD;LOC103691247;SBCAD 2-methyl branched chain acyl-CoA dehydrogenase;2-methylbutyryl-CoA dehydrogenase;2-methylbutyryl-coenzyme A dehydrogenase;Acyl-Coenzyme A dehydrogenase short-branched chain;Acyl-Coenzyme A dehydrogenase, short-branched chain;acyl-Coenzyme A dehydrogenase, short/branched chain;short/branched chain specific acyl-CoA dehydrogenase, mitochondrial;uncharacterized 
LOC103691247 PROVISIONAL protein-coding 1 209013684 209048775 + 1 201981362 202022771 + 1 186188987 186230379 + 1 195619088 195660564 + diff --git a/src/python/test/xrefs/parsers/flatfiles/ucsc.txt b/src/python/test/xrefs/parsers/flatfiles/ucsc.txt new file mode 100644 index 000000000..d00e5e2c5 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/ucsc.txt @@ -0,0 +1,10 @@ +ENST00000619216.1 chr1 - 17368 17436 17368 17368 1 17368, 17436, uc031tla.1 +ENST00000473358.1 chr1 + 29553 31097 29553 29553 3 29553,30563,30975, 30039,30667,31097, uc057aty.1 +ENST00000469289.1 chr1 + 30266 31109 30266 30266 2 30266,30975, 30667,31109, uc057atz.1 +ENST00000607096.1 chr1 + 30365 30503 30365 30365 1 30365, 30503, uc031tlb.1 +ENST00000417324.1 chr1 - 34553 36081 34553 34553 3 34553,35276,35720, 35174,35481,36081, uc001aak.4 +ENST00000461467.1 chr1 - 35244 36073 35244 35244 2 35244,35720, 35481,36073, uc057aua.1 +ENST00000641515.2 chr1 + 65418 71585 65564 70008 3 65418,65519,69036, 65433,65573,71585, A0A2U3U0J3 uc001aal.2 +ENST00000335137.4 chr1 + 69054 70108 69090 70008 1 69054, 70108, Q8NH21 uc285fxb.1 +ENST00000466430.5 chr1 - 89294 120932 89294 89294 4 89294,92090,112699,120774, 91629,92240,112804,120932, uc057aub.1 +ENST00000495576.1 chr1 - 89550 91105 89550 89550 2 89550,90286, 90050,91105, uc057auc.1 diff --git a/src/python/test/xrefs/parsers/flatfiles/uniprot_release.txt b/src/python/test/xrefs/parsers/flatfiles/uniprot_release.txt new file mode 100644 index 000000000..3b34a92b6 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/uniprot_release.txt @@ -0,0 +1,3 @@ +UniProt Knowledgebase Release 2024_03 consists of: +UniProtKB/Swiss-Prot Release 2024_03 of 29-May-2024 +UniProtKB/TrEMBL Release 2024_03 of 29-May-2024 diff --git a/src/python/test/xrefs/parsers/flatfiles/uniprot_swissprot.txt b/src/python/test/xrefs/parsers/flatfiles/uniprot_swissprot.txt new file mode 100644 index 000000000..fda4ed35d --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/uniprot_swissprot.txt @@ -0,0 +1,591 @@ +ID 1433B_HUMAN Reviewed; 246 AA. +AC P31946; A8K9K2; E1P616; +DT 01-JUL-1993, integrated into UniProtKB/Swiss-Prot. +DT 23-JAN-2007, sequence version 3. +DT 29-MAY-2024, entry version 248. +DE RecName: Full=14-3-3 protein beta/alpha; +DE AltName: Full=Protein 1054; +DE AltName: Full=Protein kinase C inhibitor protein 1; +DE Short=KCIP-1; +DE Contains: +DE RecName: Full=14-3-3 protein beta/alpha, N-terminally processed; +GN Name=YWHAB; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606; +DR EMBL; X57346; CAA40621.1; -; mRNA. +DR EMBL; AK292717; BAF85406.1; -; mRNA. +DR EMBL; AL008725; -; NOT_ANNOTATED_CDS; Genomic_DNA. +DR EMBL; CH471077; EAW75893.1; -; Genomic_DNA. +DR EMBL; CH471077; EAW75894.1; -; Genomic_DNA. +DR EMBL; CH471077; EAW75896.1; -; Genomic_DNA. +DR EMBL; BC001359; AAH01359.1; -; mRNA. +DR PIR; S34755; S34755. +DR RefSeq; NP_003395.1; NM_003404.4. [P31946-1] +DR RefSeq; NP_647539.1; NM_139323.3. [P31946-1] +DR RefSeq; XP_016883528.1; XM_017028039.1. +DR PDB; 2BQ0; X-ray; 2.50 A; A/B=2-239. +DR PDB; 2C23; X-ray; 2.65 A; A=2-239. +DR PDB; 4DNK; X-ray; 2.20 A; A/B=1-246. +DR PDB; 5N10; X-ray; 1.60 A; A/B=1-246. +DR PDB; 6A5Q; X-ray; 2.00 A; A/B/C=1-246. +DR PDB; 6BYK; X-ray; 3.00 A; A/B/C/D=3-232. +DR PDB; 6GN0; X-ray; 3.24 A; A/B/C/D=1-239. +DR PDB; 6GN8; X-ray; 2.34 A; A/B=1-234. 
+DR PDB; 6GNJ; X-ray; 3.24 A; A/B=1-234. +DR PDB; 6GNK; X-ray; 2.55 A; A/B=1-234. +DR PDB; 6GNN; X-ray; 3.79 A; A=1-239. +DR PDB; 6HEP; X-ray; 1.86 A; A/B/C/D=1-232. +DR PDB; 8DP5; EM; 3.10 A; C=1-246. +DR PDB; 8EQ8; X-ray; 1.50 A; A/B=1-239. +DR PDB; 8EQH; X-ray; 1.90 A; A/B=1-239. +DR PDBsum; 2BQ0; -. +DR PDBsum; 2C23; -. +DR PDBsum; 4DNK; -. +DR PDBsum; 5N10; -. +DR PDBsum; 6A5Q; -. +DR PDBsum; 6BYK; -. +DR PDBsum; 6GN0; -. +DR PDBsum; 6GN8; -. +DR PDBsum; 6GNJ; -. +DR PDBsum; 6GNK; -. +DR PDBsum; 6GNN; -. +DR PDBsum; 6HEP; -. +DR PDBsum; 8DP5; -. +DR PDBsum; 8EQ8; -. +DR PDBsum; 8EQH; -. +DR AlphaFoldDB; P31946; -. +DR EMDB; EMD-27630; -. +DR SASBDB; P31946; -. +DR SMR; P31946; -. +DR BioGRID; 113361; 1082. +DR CORUM; P31946; -. +DR DIP; DIP-743N; -. +DR ELM; P31946; -. +DR IntAct; P31946; 656. +DR MINT; P31946; -. +DR STRING; 9606.ENSP00000361930; -. +DR BindingDB; P31946; -. +DR ChEMBL; CHEMBL3710403; -. +DR DrugBank; DB09130; Copper. +DR DrugBank; DB12695; Phenethyl Isothiocyanate. +DR GlyGen; P31946; 1 site, 1 O-linked glycan (1 site). +DR iPTMnet; P31946; -. +DR MetOSite; P31946; -. +DR PhosphoSitePlus; P31946; -. +DR SwissPalm; P31946; -. +DR BioMuta; YWHAB; -. +DR DMDM; 1345590; -. +DR OGP; P31946; -. +DR REPRODUCTION-2DPAGE; IPI00216318; -. +DR CPTAC; CPTAC-142; -. +DR jPOST; P31946; -. +DR MassIVE; P31946; -. +DR MaxQB; P31946; -. +DR PaxDb; 9606-ENSP00000361930; -. +DR PeptideAtlas; P31946; -. +DR PRIDE; P31946; -. +DR ProteomicsDB; 54816; -. +DR ProteomicsDB; 54817; -. [P31946-2] +DR Pumba; P31946; -. +DR TopDownProteomics; P31946-1; -. [P31946-1] +DR TopDownProteomics; P31946-2; -. [P31946-2] +DR Antibodypedia; 1906; 847 antibodies from 46 providers. +DR CPTC; P31946; 3 antibodies. +DR DNASU; 7529; -. +DR Ensembl; ENST00000353703.9; ENSP00000300161.4; ENSG00000166913.13. [P31946-1] +DR Ensembl; ENST00000372839.7; ENSP00000361930.3; ENSG00000166913.13. [P31946-1] +DR GeneID; 7529; -. +DR KEGG; hsa:7529; -. +DR MANE-Select; ENST00000353703.9; ENSP00000300161.4; NM_139323.4; NP_647539.1. +DR AGR; HGNC:12849; -. +DR CTD; 7529; -. +DR DisGeNET; 7529; -. +DR GeneCards; YWHAB; -. +DR HPA; ENSG00000166913; Low tissue specificity. +DR neXtProt; NX_P31946; -. +DR OpenTargets; ENSG00000166913; -. +DR PharmGKB; PA37438; -. +DR VEuPathDB; HostDB:ENSG00000166913; -. +DR eggNOG; KOG0841; Eukaryota. +DR GeneTree; ENSGT01090000260040; -. +DR HOGENOM; CLU_058290_1_0_1; -. +DR InParanoid; P31946; -. +DR OMA; EQHVTII; -. +DR OrthoDB; 920089at2759; -. +DR PhylomeDB; P31946; -. +DR TreeFam; TF102003; -. +DR PathwayCommons; P31946; -. +DR SignaLink; P31946; -. +DR SIGNOR; P31946; -. +DR BioGRID-ORCS; 7529; 19 hits in 1156 CRISPR screens. +DR ChiTaRS; YWHAB; human. +DR EvolutionaryTrace; P31946; -. +DR GeneWiki; YWHAB; -. +DR Pharos; P31946; Tbio. +DR PRO; PR:P31946; -. +DR Proteomes; UP000005640; Chromosome 20. +DR RNAct; P31946; Protein. +DR Bgee; ENSG00000166913; Expressed in endothelial cell and 214 other cell types or tissues. +DR ExpressionAtlas; P31946; baseline and differential. +DR CDD; cd10022; 14-3-3_beta_zeta; 1. +DR Gene3D; 1.20.190.20; 14-3-3 domain; 1. +DR IDEAL; IID00038; -. +DR InterPro; IPR000308; 14-3-3. +DR InterPro; IPR023409; 14-3-3_CS. +DR InterPro; IPR036815; 14-3-3_dom_sf. +DR InterPro; IPR023410; 14-3-3_domain. +DR PANTHER; PTHR18860; 14-3-3 PROTEIN; 1. +DR PANTHER; PTHR18860:SF28; 14-3-3 PROTEIN BETA_ALPHA; 1. +DR Pfam; PF00244; 14-3-3; 1. +DR PIRSF; PIRSF000868; 14-3-3; 1. +DR PRINTS; PR00305; 1433ZETA. +DR SMART; SM00101; 14_3_3; 1. 
+DR SUPFAM; SSF48445; 14-3-3 protein; 1. +DR PROSITE; PS00796; 1433_1; 1. +DR PROSITE; PS00797; 1433_2; 1. +PE 1: Evidence at protein level; +KW 3D-structure; Acetylation; Alternative initiation; Cytoplasm; +KW Direct protein sequencing; Host-virus interaction; Isopeptide bond; +KW Membrane; Nitration; Phosphoprotein; Reference proteome; Ubl conjugation; +KW Vacuole. +SQ SEQUENCE 246 AA; 28082 MW; 6BE1A9BF97468017 CRC64; + MTMDKSELVQ KAKLAEQAER YDDMAAAMKA VTEQGHELSN EERNLLSVAY KNVVGARRSS + WRVISSIEQK TERNEKKQQM GKEYREKIEA ELQDICNDVL ELLDKYLIPN ATQPESKVFY + LKMKGDYFRY LSEVASGDNK QTTVSNSQQA YQEAFEISKK EMQPTHPIRL GLALNFSVFY + YEILNSPEKA CSLAKTAFDE AIAELDTLNE ESYKDSTLIM QLLRDNLTLW TSENQGDEGD + AGEGEN +// +ID 1433E_HUMAN Reviewed; 255 AA. +AC P62258; B3KY71; D3DTH5; P29360; P42655; Q4VJB6; Q53XZ5; Q63631; Q7M4R4; +DT 05-JUL-2004, integrated into UniProtKB/Swiss-Prot. +DT 05-JUL-2004, sequence version 1. +DT 29-MAY-2024, entry version 207. +DE RecName: Full=14-3-3 protein epsilon; +DE Short=14-3-3E; +DE EC=2.1.1.148 {ECO:0000256|HAMAP-Rule:MF_01408}; +GN Name=YWHAE; Synonyms=YWHAE1; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606; +DR EMBL; U20972; AAC50175.1; -; mRNA. +DR EMBL; U54778; AAC50710.1; -; mRNA. +DR EMBL; U43399; AAC50625.1; -; mRNA. +DR EMBL; U43430; AAD00026.1; -; mRNA. +DR EMBL; U28936; AAA75301.1; -; mRNA. +DR EMBL; AB017103; BAA32538.1; -; Genomic_DNA. +DR EMBL; AY883089; AAX68683.1; -; mRNA. +DR EMBL; AK128785; BAG54733.1; -; mRNA. +DR EMBL; AK295260; BAG58249.1; -; mRNA. +DR EMBL; AK316185; BAH14556.1; -; mRNA. +DR EMBL; BT007161; AAP35825.1; -; mRNA. +DR EMBL; CH471108; EAW90628.1; -; Genomic_DNA. +DR EMBL; CH471108; EAW90629.1; -; Genomic_DNA. +DR EMBL; BC000179; AAH00179.1; -; mRNA. +DR EMBL; BC001440; AAH01440.1; -; mRNA. +DR PIR; A61235; A61235. +DR PIR; I38947; I38947. +DR RefSeq; NP_006752.1; NM_006761.4. [P62258-1] +DR PDB; 2BR9; X-ray; 1.75 A; A=1-233. +DR PDB; 3UAL; X-ray; 1.80 A; A=1-232. +DR PDB; 3UBW; X-ray; 1.90 A; A=1-234. +DR PDB; 6EIH; X-ray; 2.70 A; A=3-232. +DR PDB; 7C8E; X-ray; 3.16 A; A/B=1-232. +DR PDB; 7V9B; X-ray; 1.85 A; A=1-232. +DR PDB; 8DGM; X-ray; 3.20 A; A=1-255. +DR PDB; 8DGN; X-ray; 3.16 A; A=1-255. +DR PDB; 8DGP; X-ray; 2.70 A; A/B/C/D=1-255. +DR PDB; 8DP5; EM; 3.10 A; D=1-255. +DR PDB; 8Q1S; X-ray; 3.23 A; A/B=1-255. +DR PDBsum; 2BR9; -. +DR PDBsum; 3UAL; -. +DR PDBsum; 3UBW; -. +DR PDBsum; 6EIH; -. +DR PDBsum; 7C8E; -. +DR PDBsum; 7V9B; -. +DR PDBsum; 8DGM; -. +DR PDBsum; 8DGN; -. +DR PDBsum; 8DGP; -. +DR PDBsum; 8DP5; -. +DR PDBsum; 8Q1S; -. +DR AlphaFoldDB; P62258; -. +DR EMDB; EMD-27630; -. +DR SMR; P62258; -. +DR BioGRID; 113363; 1160. +DR CORUM; P62258; -. +DR DIP; DIP-36676N; -. +DR ELM; P62258; -. +DR IntAct; P62258; 679. +DR MINT; P62258; -. +DR STRING; 9606.ENSP00000264335; -. +DR ChEMBL; CHEMBL3329082; -. +DR DrugBank; DB01780; Fusicoccin. +DR DrugBank; DB12695; Phenethyl Isothiocyanate. +DR MoonDB; P62258; Predicted. +DR TCDB; 8.A.98.1.10; the 14-3-3 protein (14-3-3) family. +DR GlyGen; P62258; 1 site, 1 O-linked glycan (1 site). +DR iPTMnet; P62258; -. +DR MetOSite; P62258; -. +DR PhosphoSitePlus; P62258; -. +DR SwissPalm; P62258; -. +DR BioMuta; YWHAE; -. +DR DMDM; 51702210; -. +DR OGP; P42655; -. +DR jPOST; P62258; -. +DR MassIVE; P62258; -. +DR MaxQB; P62258; -. +DR PaxDb; 9606-ENSP00000264335; -. +DR PeptideAtlas; P62258; -. +DR PRIDE; P62258; -. 
+DR ProteomicsDB; 57377; -. +DR ProteomicsDB; 57378; -. [P62258-2] +DR Pumba; P62258; -. +DR TopDownProteomics; P62258-1; -. [P62258-1] +DR Antibodypedia; 1898; 612 antibodies from 40 providers. +DR CPTC; P62258; 3 antibodies. +DR DNASU; 7531; -. +DR Ensembl; ENST00000264335.13; ENSP00000264335.8; ENSG00000108953.17. [P62258-1] +DR Ensembl; ENST00000571732.5; ENSP00000461762.1; ENSG00000108953.17. [P62258-2] +DR Ensembl; ENST00000616643.3; ENSP00000481059.2; ENSG00000274474.3. [P62258-2] +DR Ensembl; ENST00000627231.2; ENSP00000487356.1; ENSG00000274474.3. [P62258-1] +DR GeneID; 7531; -. +DR KEGG; hsa:7531; -. +DR MANE-Select; ENST00000264335.13; ENSP00000264335.8; NM_006761.5; NP_006752.1. +DR AGR; HGNC:12851; -. +DR CTD; 7531; -. +DR DisGeNET; 7531; -. +DR GeneCards; YWHAE; -. +DR HPA; ENSG00000108953; Low tissue specificity. +DR MalaCards; YWHAE; -. +DR neXtProt; NX_P62258; -. +DR OpenTargets; ENSG00000108953; -. +DR PharmGKB; PA37440; -. +DR VEuPathDB; HostDB:ENSG00000108953; -. +DR eggNOG; KOG0841; Eukaryota. +DR GeneTree; ENSGT01110000267238; -. +DR HOGENOM; CLU_058290_0_0_1; -. +DR InParanoid; P62258; -. +DR OMA; KGCQLAR; -. +DR OrthoDB; 920089at2759; -. +DR PhylomeDB; P62258; -. +DR TreeFam; TF102003; -. +DR PathwayCommons; P62258; -. +DR SignaLink; P62258; -. +DR SIGNOR; P62258; -. +DR BioGRID-ORCS; 7531; 212 hits in 1128 CRISPR screens. +DR ChiTaRS; YWHAE; human. +DR EvolutionaryTrace; P62258; -. +DR GeneWiki; YWHAE; -. +DR Pharos; P62258; Tbio. +DR PRO; PR:P62258; -. +DR Proteomes; UP000005640; Chromosome 17. +DR RNAct; P62258; Protein. +DR Bgee; ENSG00000108953; Expressed in superior frontal gyrus and 116 other cell types or tissues. +DR ExpressionAtlas; P62258; baseline and differential. +DR CDD; cd10020; 14-3-3_epsilon; 1. +DR Gene3D; 1.20.190.20; 14-3-3 domain; 1. +DR IDEAL; IID00512; -. +DR InterPro; IPR000308; 14-3-3. +DR InterPro; IPR023409; 14-3-3_CS. +DR InterPro; IPR036815; 14-3-3_dom_sf. +DR InterPro; IPR023410; 14-3-3_domain. +DR PANTHER; PTHR18860; 14-3-3 PROTEIN; 1. +DR PANTHER; PTHR18860:SF17; 14-3-3 PROTEIN EPSILON; 1. +DR Pfam; PF00244; 14-3-3; 1. +DR PIRSF; PIRSF000868; 14-3-3; 1. +DR PRINTS; PR00305; 1433ZETA. +DR SMART; SM00101; 14_3_3; 1. +DR SUPFAM; SSF48445; 14-3-3 protein; 1. +DR PROSITE; PS00796; 1433_1; 1. +DR PROSITE; PS00797; 1433_2; 1. +PE 1: Evidence at protein level; +KW 3D-structure; Acetylation; Alternative splicing; Cytoplasm; +KW Direct protein sequencing; Host-virus interaction; Isopeptide bond; +KW Nucleus; Phosphoprotein; Reference proteome; Ubl conjugation. +SQ SEQUENCE 255 AA; 29174 MW; 07817CCBD1F75B26 CRC64; + MDDREDLVYQ AKLAEQAERY DEMVESMKKV AGMDVELTVE ERNLLSVAYK NVIGARRASW + RIISSIEQKE ENKGGEDKLK MIREYRQMVE TELKLICCDI LDVLDKHLIP AANTGESKVF + YYKMKGDYHR YLAEFATGND RKEAAENSLV AYKAASDIAM TELPPTHPIR LGLALNFSVF + YYEILNSPDR ACRLAKAAFD DAIAELDTLS EESYKDSTLI MQLLRDNLTL WTSDMQGDGE + EQNKEALQDV EDENQ +// +ID 1433F_HUMAN Reviewed; 246 AA. +AC Q04917; +DT 01-OCT-1993, integrated into UniProtKB/Swiss-Prot. +DT 23-JAN-2007, sequence version 4. +DT 29-MAY-2024, entry version 238. +DE RecName: Full=14-3-3 protein eta; +DE AltName: Full=Protein AS1; +GN Name=YWHAH; Synonyms=YWHA1; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606; +CC CAUTION: The sequence shown here is derived from an Ensembl +DR EMBL; L20422; AAA35483.1; -; mRNA. +DR EMBL; X80536; CAA56676.1; -; Genomic_DNA. 
+DR EMBL; X78138; CAA55017.1; -; mRNA. +DR EMBL; X57345; CAA40620.1; -; mRNA. +DR EMBL; D78577; BAA11418.1; -; Genomic_DNA. +DR EMBL; S80794; AAB36036.1; -; mRNA. +DR EMBL; CR456612; CAG30498.1; -; mRNA. +DR EMBL; Z82248; -; NOT_ANNOTATED_CDS; Genomic_DNA. +DR EMBL; BC003047; AAH03047.1; -; mRNA. +DR PIR; S34756; S34756. +DR PIR; S38509; S38509. +DR PIR; S38532; S38532. +DR RefSeq; NP_003396.1; NM_003405.3. +DR PDB; 2C63; X-ray; 2.15 A; A/B/C/D=2-246. +DR PDB; 2C74; X-ray; 2.70 A; A/B=2-246. +DR PDB; 7NMZ; X-ray; 2.30 A; AA/BA=1-234. +DR PDBsum; 2C63; -. +DR PDBsum; 2C74; -. +DR PDBsum; 7NMZ; -. +DR AlphaFoldDB; Q04917; -. +DR SMR; Q04917; -. +DR BioGRID; 113365; 1114. +DR CORUM; Q04917; -. +DR DIP; DIP-27566N; -. +DR ELM; Q04917; -. +DR IntAct; Q04917; 937. +DR MINT; Q04917; -. +DR STRING; 9606.ENSP00000248975; -. +DR BindingDB; Q04917; -. +DR ChEMBL; CHEMBL3708585; -. +DR DrugBank; DB12695; Phenethyl Isothiocyanate. +DR GlyGen; Q04917; 1 site, 1 O-linked glycan (1 site). +DR iPTMnet; Q04917; -. +DR MetOSite; Q04917; -. +DR PhosphoSitePlus; Q04917; -. +DR SwissPalm; Q04917; -. +DR BioMuta; YWHAH; -. +DR DMDM; 1345593; -. +DR jPOST; Q04917; -. +DR MassIVE; Q04917; -. +DR MaxQB; Q04917; -. +DR PaxDb; 9606-ENSP00000248975; -. +DR PeptideAtlas; Q04917; -. +DR ProteomicsDB; 58300; -. +DR Pumba; Q04917; -. +DR TopDownProteomics; Q04917; -. +DR Antibodypedia; 11204; 346 antibodies from 37 providers. +DR DNASU; 7533; -. +DR Ensembl; ENST00000248975.6; ENSP00000248975.5; ENSG00000128245.15. +DR GeneID; 7533; -. +DR KEGG; hsa:7533; -. +DR MANE-Select; ENST00000248975.6; ENSP00000248975.5; NM_003405.4; NP_003396.1. +DR AGR; HGNC:12853; -. +DR CTD; 7533; -. +DR DisGeNET; 7533; -. +DR GeneCards; YWHAH; -. +DR HPA; ENSG00000128245; Tissue enriched (brain). +DR neXtProt; NX_Q04917; -. +DR OpenTargets; ENSG00000128245; -. +DR PharmGKB; PA37442; -. +DR VEuPathDB; HostDB:ENSG00000128245; -. +DR eggNOG; KOG0841; Eukaryota. +DR GeneTree; ENSGT01090000260040; -. +DR HOGENOM; CLU_058290_0_0_1; -. +DR InParanoid; Q04917; -. +DR OMA; IEQKTMS; -. +DR OrthoDB; 920089at2759; -. +DR PhylomeDB; Q04917; -. +DR TreeFam; TF102003; -. +DR PathwayCommons; Q04917; -. +DR SignaLink; Q04917; -. +DR SIGNOR; Q04917; -. +DR BioGRID-ORCS; 7533; 15 hits in 1154 CRISPR screens. +DR ChiTaRS; YWHAH; human. +DR EvolutionaryTrace; Q04917; -. +DR GeneWiki; YWHAH; -. +DR Pharos; Q04917; Tbio. +DR PRO; PR:Q04917; -. +DR Proteomes; UP000005640; Chromosome 22. +DR RNAct; Q04917; Protein. +DR Bgee; ENSG00000128245; Expressed in frontal pole and 196 other cell types or tissues. +DR ExpressionAtlas; Q04917; baseline and differential. +DR CDD; cd10025; 14-3-3_eta; 1. +DR Gene3D; 1.20.190.20; 14-3-3 domain; 1. +DR InterPro; IPR000308; 14-3-3. +DR InterPro; IPR023409; 14-3-3_CS. +DR InterPro; IPR036815; 14-3-3_dom_sf. +DR InterPro; IPR023410; 14-3-3_domain. +DR PANTHER; PTHR18860; 14-3-3 PROTEIN; 1. +DR PANTHER; PTHR18860:SF16; 14-3-3 PROTEIN ETA; 1. +DR Pfam; PF00244; 14-3-3; 1. +DR PIRSF; PIRSF000868; 14-3-3; 1. +DR PRINTS; PR00305; 1433ZETA. +DR SMART; SM00101; 14_3_3; 1. +DR SUPFAM; SSF48445; 14-3-3 protein; 1. +DR PROSITE; PS00796; 1433_1; 1. +DR PROSITE; PS00797; 1433_2; 1. +PE 1: Evidence at protein level; +KW 3D-structure; Acetylation; Direct protein sequencing; Phosphoprotein; +KW Reference proteome. +SQ SEQUENCE 246 AA; 28219 MW; D70FBC100C45D6E5 CRC64; + MGDREQLLQR ARLAEQAERY DDMASAMKAV TELNEPLSNE DRNLLSVAYK NVVGARRSSW + EAGEGN +// +ID 1433G_HUMAN Reviewed; 247 AA. 
+AC P61981; O70457; P35214; Q6FH52; Q9UDP2; Q9UN99; +DT 07-JUN-2004, integrated into UniProtKB/Swiss-Prot. +DT 23-JAN-2007, sequence version 2. +DT 29-MAY-2024, entry version 197. +DE RecName: Full=14-3-3 protein gamma {ECO:0000305}; +DE AltName: Full=Protein kinase C inhibitor protein 1; +DE Short=KCIP-1; +DE Contains: +DE RecName: Full=14-3-3 protein gamma, N-terminally processed; +GN Name=YWHAG {ECO:0000312|HGNC:HGNC:12852}; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606; +DR EMBL; AF142498; AAD48408.1; -; mRNA. +DR EMBL; AB024334; BAA85184.1; -; mRNA. +DR EMBL; CR541904; CAG46702.1; -; mRNA. +DR EMBL; CR541925; CAG46723.1; -; mRNA. +DR EMBL; AC006388; -; NOT_ANNOTATED_CDS; Genomic_DNA. +DR EMBL; BC020963; AAH20963.1; -; mRNA. +DR RefSeq; NP_036611.2; NM_012479.3. +DR PDB; 2B05; X-ray; 2.55 A; A/B/C/D/E/F=2-247. +DR PDB; 3UZD; X-ray; 1.86 A; A=1-247. +DR PDB; 4E2E; X-ray; 2.25 A; A=1-247. +DR PDB; 4J6S; X-ray; 3.08 A; A/B/C/D=1-247. +DR PDB; 4O46; X-ray; 2.90 A; A/B/C/D/E/F=1-247. +DR PDB; 5D3E; X-ray; 2.75 A; A/B/E/F/I/J=1-238. +DR PDB; 6A5S; X-ray; 2.10 A; A/B/D/G=1-247. +DR PDB; 6BYJ; X-ray; 2.90 A; A/B/C/D/E/F=2-241. +DR PDB; 6BYL; X-ray; 3.35 A; A/B/C/D/E/F=2-241. +DR PDB; 6BZD; X-ray; 2.67 A; A/B/C/D=2-247. +DR PDB; 6FEL; X-ray; 2.84 A; A/B/C/D=1-234. +DR PDB; 6GKF; X-ray; 2.60 A; A/B/C/D/E/F/G/H=1-234. +DR PDB; 6GKG; X-ray; 2.85 A; A/B/C/D/E/F/G/H=1-234. +DR PDB; 6S9K; X-ray; 1.60 A; A=1-234. +DR PDB; 6SAD; X-ray; 2.75 A; A/B=1-234. +DR PDB; 6Y4K; X-ray; 3.00 A; A/B=1-234. +DR PDB; 6Y6B; X-ray; 3.08 A; A/B=1-234. +DR PDB; 6ZBT; X-ray; 1.80 A; A/B/C/D=1-234. +DR PDB; 6ZC9; X-ray; 1.90 A; A/B/C/D=1-234. +DR PDB; 7A6R; X-ray; 2.70 A; A/B/C/D=1-234. +DR PDB; 7A6Y; X-ray; 2.50 A; A/B/C/D=1-234. +DR PDBsum; 2B05; -. +DR PDBsum; 3UZD; -. +DR PDBsum; 4E2E; -. +DR PDBsum; 4J6S; -. +DR PDBsum; 4O46; -. +DR PDBsum; 5D3E; -. +DR PDBsum; 6A5S; -. +DR PDBsum; 6BYJ; -. +DR PDBsum; 6BYL; -. +DR PDBsum; 6BZD; -. +DR PDBsum; 6FEL; -. +DR PDBsum; 6GKF; -. +DR PDBsum; 6GKG; -. +DR PDBsum; 6S9K; -. +DR PDBsum; 6SAD; -. +DR PDBsum; 6Y4K; -. +DR PDBsum; 6Y6B; -. +DR PDBsum; 6ZBT; -. +DR PDBsum; 6ZC9; -. +DR PDBsum; 7A6R; -. +DR PDBsum; 7A6Y; -. +DR AlphaFoldDB; P61981; -. +DR SASBDB; P61981; -. +DR SMR; P61981; -. +DR BioGRID; 113364; 1250. +DR CORUM; P61981; -. +DR DIP; DIP-33406N; -. +DR ELM; P61981; -. +DR IntAct; P61981; 1062. +DR MINT; P61981; -. +DR STRING; 9606.ENSP00000306330; -. +DR BindingDB; P61981; -. +DR ChEMBL; CHEMBL1293296; -. +DR TCDB; 8.A.98.1.11; the 14-3-3 protein (14-3-3) family. +DR GlyGen; P61981; 1 site, 1 O-linked glycan (1 site). +DR iPTMnet; P61981; -. +DR MetOSite; P61981; -. +DR PhosphoSitePlus; P61981; -. +DR SwissPalm; P61981; -. +DR BioMuta; YWHAG; -. +DR DMDM; 48428721; -. +DR REPRODUCTION-2DPAGE; IPI00220642; -. +DR CPTAC; CPTAC-450; -. +DR CPTAC; CPTAC-451; -. +DR jPOST; P61981; -. +DR MassIVE; P61981; -. +DR MaxQB; P61981; -. +DR PaxDb; 9606-ENSP00000306330; -. +DR PeptideAtlas; P61981; -. +DR PRIDE; P61981; -. +DR ProteomicsDB; 57355; -. +DR Pumba; P61981; -. +DR TopDownProteomics; P61981; -. +DR Antibodypedia; 4339; 621 antibodies from 41 providers. +DR DNASU; 7532; -. +DR Ensembl; ENST00000307630.5; ENSP00000306330.3; ENSG00000170027.7. +DR GeneID; 7532; -. +DR KEGG; hsa:7532; -. +DR MANE-Select; ENST00000307630.5; ENSP00000306330.3; NM_012479.4; NP_036611.2. +DR AGR; HGNC:12852; -. +DR CTD; 7532; -. 
+DR DisGeNET; 7532; -. +DR GeneCards; YWHAG; -. +DR HPA; ENSG00000170027; Tissue enhanced (brain, skeletal muscle). +DR MalaCards; YWHAG; -. +DR neXtProt; NX_P61981; -. +DR OpenTargets; ENSG00000170027; -. +DR PharmGKB; PA37441; -. +DR VEuPathDB; HostDB:ENSG00000170027; -. +DR eggNOG; KOG0841; Eukaryota. +DR GeneTree; ENSGT01090000260040; -. +DR HOGENOM; CLU_058290_0_0_1; -. +DR InParanoid; P61981; -. +DR OMA; AYGEAHE; -. +DR OrthoDB; 920089at2759; -. +DR PhylomeDB; P61981; -. +DR TreeFam; TF102003; -. +DR PathwayCommons; P61981; -. +DR SignaLink; P61981; -. +DR SIGNOR; P61981; -. +DR BioGRID-ORCS; 7532; 29 hits in 1163 CRISPR screens. +DR ChiTaRS; YWHAG; human. +DR EvolutionaryTrace; P61981; -. +DR GeneWiki; YWHAG; -. +DR Pharos; P61981; Tchem. +DR PRO; PR:P61981; -. +DR Proteomes; UP000005640; Chromosome 7. +DR RNAct; P61981; Protein. +DR Bgee; ENSG00000170027; Expressed in lateral nuclear group of thalamus and 193 other cell types or tissues. +DR CDD; cd10024; 14-3-3_gamma; 1. +DR Gene3D; 1.20.190.20; 14-3-3 domain; 1. +DR InterPro; IPR000308; 14-3-3. +DR InterPro; IPR023409; 14-3-3_CS. +DR InterPro; IPR036815; 14-3-3_dom_sf. +DR InterPro; IPR023410; 14-3-3_domain. +DR PANTHER; PTHR18860; 14-3-3 PROTEIN; 1. +DR PANTHER; PTHR18860:SF22; 14-3-3 PROTEIN GAMMA; 1. +DR Pfam; PF00244; 14-3-3; 1. +DR PIRSF; PIRSF000868; 14-3-3; 1. +DR PRINTS; PR00305; 1433ZETA. +DR SMART; SM00101; 14_3_3; 1. +DR SUPFAM; SSF48445; 14-3-3 protein; 1. +DR PROSITE; PS00796; 1433_1; 1. +DR PROSITE; PS00797; 1433_2; 1. +PE 1: Evidence at protein level; +KW 3D-structure; Acetylation; Cytoplasm; Direct protein sequencing; +KW Disease variant; Epilepsy; Phosphoprotein; Reference proteome. +SQ SEQUENCE 247 AA; 28303 MW; B0D16C6DE1F4455D CRC64; + MVDREQLVQK ARLAEQAERY DDMAAAMKNV TELNEPLSNE ERNLLSVAYK NVVGARRSSW + RVISSIEQKT SADGNEKKIE MVRAYREKIE KELEAVCQDV LSLLDNYLIK NCSETQYESK + VFYLKMKGDY YRYLAEVATG EKRATVVESS EKAYSEAHEI SKEHMQPTHP IRLGLALNYS + VFYYEIQNAP EQACHLAKTA FDDAIAELDT LNEDSYKDST LIMQLLRDNL TLWTSDQQDD + DGGEGNN +// diff --git a/src/python/test/xrefs/parsers/flatfiles/uniprot_trembl.txt b/src/python/test/xrefs/parsers/flatfiles/uniprot_trembl.txt new file mode 100644 index 000000000..23cfd58d6 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/uniprot_trembl.txt @@ -0,0 +1,570 @@ +ID E5KP32_HUMAN Unreviewed; 535 AA. +AC E5KP32; +DT 08-FEB-2011, integrated into UniProtKB/TrEMBL. +DT 08-FEB-2011, sequence version 1. +DT 29-MAY-2024, entry version 50. +DE RecName: Full=Adenine DNA glycosylase {ECO:0000256|ARBA:ARBA00022023, ECO:0000256|RuleBase:RU365096}; +DE EC=3.2.2.31 {ECO:0000256|ARBA:ARBA00012045, ECO:0000256|RuleBase:RU365096}; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:ADP90939.1}; +DR EMBL; HQ205467; ADP90939.1; -; Genomic_DNA. +DR EMBL; HQ205471; ADP90959.1; -; Genomic_DNA. +DR EMBL; HQ205478; ADP90994.1; -; Genomic_DNA. +DR EMBL; HQ205504; ADP91124.1; -; Genomic_DNA. +DR AlphaFoldDB; E5KP32; -. +DR PeptideAtlas; E5KP32; -. +DR CDD; cd03431; DNA_Glycosylase_C; 1. +DR CDD; cd00056; ENDO3c; 1. +DR Gene3D; 1.10.1670.10; Helix-hairpin-Helix base-excision DNA repair enzymes (C-terminal); 1. +DR Gene3D; 3.90.79.10; Nucleoside Triphosphate Pyrophosphohydrolase; 1. +DR InterPro; IPR011257; DNA_glycosylase. +DR InterPro; IPR004036; Endonuclease-III-like_CS2. +DR InterPro; IPR003651; Endonuclease3_FeS-loop_motif. 
+DR InterPro; IPR004035; Endouclease-III_FeS-bd_BS. +DR InterPro; IPR003265; HhH-GPD_domain. +DR InterPro; IPR023170; HhH_base_excis_C. +DR InterPro; IPR000445; HhH_motif. +DR InterPro; IPR044298; MIG/MutY. +DR InterPro; IPR029119; MutY_C. +DR InterPro; IPR015797; NUDIX_hydrolase-like_dom_sf. +DR InterPro; IPR000086; NUDIX_hydrolase_dom. +DR PANTHER; PTHR42944; ADENINE DNA GLYCOSYLASE; 1. +DR PANTHER; PTHR42944:SF1; ADENINE DNA GLYCOSYLASE; 1. +DR Pfam; PF00633; HHH; 1. +DR Pfam; PF00730; HhH-GPD; 1. +DR Pfam; PF14815; NUDIX_4; 1. +DR SMART; SM00478; ENDO3c; 1. +DR SMART; SM00525; FES; 1. +DR SUPFAM; SSF48150; DNA-glycosylase; 1. +DR SUPFAM; SSF55811; Nudix; 1. +DR PROSITE; PS00764; ENDONUCLEASE_III_1; 1. +DR PROSITE; PS01155; ENDONUCLEASE_III_2; 1. +DR PROSITE; PS51462; NUDIX; 1. +PE 3: Inferred from homology; +KW 4Fe-4S {ECO:0000256|ARBA:ARBA00022485}; +KW DNA damage {ECO:0000256|ARBA:ARBA00022763, ECO:0000256|RuleBase:RU365096}; +KW DNA repair {ECO:0000256|ARBA:ARBA00023204}; +KW Glycosidase {ECO:0000256|ARBA:ARBA00023295, ECO:0000256|RuleBase:RU365096}; +KW Hydrolase {ECO:0000256|ARBA:ARBA00022801}; +KW Iron {ECO:0000256|ARBA:ARBA00023004, ECO:0000256|RuleBase:RU365096}; +KW Iron-sulfur {ECO:0000256|ARBA:ARBA00023014}; +KW Metal-binding {ECO:0000256|ARBA:ARBA00022723}. +SQ SEQUENCE 535 AA; 59080 MW; 4F7956A45A21226A CRC64; + MTPLVSRLSR LWAIMRKPRA AVGSGHRKQA ASQEGRQKHA KNNSQAKPSA CDGLARQPEE + VVLQASVSSY HLFRDVAEVT AFRGSLLSWY DQEKRDLPWR RRAEDEMDLD RRAYAVWVSE + VMLQQTQVAT VINYYTGWMQ KWPTLQDLAS ASLEEVNQLW AGLGYYSRGR RLQEGARKVV + EELGGHMPRT AETLQQLLPG VGRYTAGAIA SIAFGQATGV VDGNVARVLC RVRAIGADPS + STLVSQQLWG LAQQLVDPAR PGDFNQAAME LGATVCTPQR PLCSQCPVES LCRARQRVEQ + EQLLASGSLS GSPDVEECAP NTGHCHLCLP PSEPWDQTLG VVNFPRKASR KPPREESSAT + CVLEQPGALG AQILLVQRPN SGLLAGLWEF PSVTWEPSEQ LQRKALLQEL QRWAGPLPAT + HLRHLGEVVH TFSHIKLTYQ VYGLALEGQT PVTTVPPGAR WLTQEEFHTA AVSTAMKKVF + RVYQGQQPGT CMGSKRSQVS SPCSRKKPRM GQQVLDNFFR SHISTDAHSL NSAAQ +// +ID A0A1U9X8M5_COW Unreviewed; 395 AA. +AC A0A1U9X8M5; +DT 07-JUN-2017, integrated into UniProtKB/TrEMBL. +DT 07-JUN-2017, sequence version 1. +DT 29-MAY-2024, entry version 41. +DE RecName: Full=Tripartite motif-containing protein 10 {ECO:0000256|ARBA:ARBA00014653}; +OS Bos taurus (Cow). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9913 {ECO:0000313|EMBL:AQY77152.1}; +DR EMBL; KY500679; AQY77151.1; -; Genomic_DNA. +DR EMBL; KY500680; AQY77152.1; -; Genomic_DNA. +DR RefSeq; NP_439893.2; NM_052828.2. +DR AlphaFoldDB; A0A1U9X8M5; -. +DR SMR; A0A1U9X8M5; -. +DR Antibodypedia; 26232; 191 antibodies from 19 providers. +DR DNASU; 10107; -. +DR GeneID; 10107; -. +DR CTD; 10107; -. +DR DisGeNET; 10107; -. +DR VEuPathDB; HostDB:ENSG00000204613; -. +DR OrthoDB; 3453019at2759; -. +DR ExpressionAtlas; A0A1U9X8M5; baseline and differential. +DR CDD; cd16593; RING-HC_TRIM10_C-IV; 1. +DR Gene3D; 2.60.120.920; -; 1. +DR Gene3D; 3.30.160.60; Classic Zinc Finger; 1. +DR Gene3D; 3.30.40.10; Zinc/RING finger domain, C3HC4 (zinc finger); 1. +DR InterPro; IPR001870; B30.2/SPRY. +DR InterPro; IPR043136; B30.2/SPRY_sf. +DR InterPro; IPR003879; Butyrophylin_SPRY. +DR InterPro; IPR013320; ConA-like_dom_sf. +DR InterPro; IPR006574; PRY. +DR InterPro; IPR042784; TRIM10_RING-HC. +DR InterPro; IPR000315; Znf_B-box. +DR InterPro; IPR018957; Znf_C3HC4_RING-type. +DR InterPro; IPR001841; Znf_RING. +DR InterPro; IPR013083; Znf_RING/FYVE/PHD. 
+DR InterPro; IPR017907; Znf_RING_CS. +DR PANTHER; PTHR24103; E3 UBIQUITIN-PROTEIN LIGASE TRIM; 1. +DR PANTHER; PTHR24103:SF329; TRIPARTITE MOTIF-CONTAINING PROTEIN 10; 1. +DR Pfam; PF13765; PRY; 1. +DR Pfam; PF00643; zf-B_box; 1. +DR Pfam; PF00097; zf-C3HC4; 1. +DR PRINTS; PR01407; BUTYPHLNCDUF. +DR SMART; SM00336; BBOX; 1. +DR SMART; SM00589; PRY; 1. +DR SMART; SM00184; RING; 1. +DR SUPFAM; SSF57845; B-box zinc-binding domain; 1. +DR SUPFAM; SSF49899; Concanavalin A-like lectins/glucanases; 1. +DR SUPFAM; SSF57850; RING/U-box; 1. +DR PROSITE; PS50188; B302_SPRY; 1. +DR PROSITE; PS50119; ZF_BBOX; 1. +DR PROSITE; PS00518; ZF_RING_1; 1. +DR PROSITE; PS50089; ZF_RING_2; 1. +PE 3: Inferred from homology; +KW Metal-binding {ECO:0000256|ARBA:ARBA00022723}; +KW Zinc {ECO:0000256|ARBA:ARBA00022833}; +KW Zinc-finger {ECO:0000256|ARBA:ARBA00022771, ECO:0000256|PROSITE- +KW ProRule:PRU00024}. +SQ SEQUENCE 395 AA; 45252 MW; EDEFCB7027B6C15D CRC64; + MASAASVTSL ADEVNCPICQ GTLREPVTID CGHNFCRACL TRYCEIPGPD LEESPTCPLC + KEPFRPGSFR PNWQLANVVE NIERLQLVST LGLGEEDVCQ EHGEKIYFFC EDDEMQLCVV + CREAGEHATH TMRFLEDAAA PYREQIHKCL KCLRKEREEI QEIQSRENKR MQVLLTQVST + KRQQVISEFA HLRKFLEEQQ SILLAQLESQ DGDILRQRDE FDLLVAGEIC RFSALIEELE + EKNERPAREL LTDIRSTLIR CETRKCRKPV AVSPELGQRI RDFPQQALPL QREMKMFLEK + LCFELDYEPA HISLDPQTSH PKLLLSEDHQ RAQFSYKWQN SPDNPQRFDR ATCVLAHTGI + TGGRHTWVWM ARVPGDSGCC QFCSPPSVLG TEVAA +// +ID A0A7D5YZ42_HUMAN Unreviewed; 106 AA. +AC A0A7D5YZ42; +DT 02-DEC-2020, integrated into UniProtKB/TrEMBL. +DT 02-DEC-2020, sequence version 1. +DT 29-MAY-2024, entry version 12. +DE SubName: Full=Cytochrome P450 2C9 {ECO:0000313|EMBL:QLI62784.1}; +DE Flags: Fragment; +GN Name=CYP2C9 {ECO:0000313|EMBL:QLI62784.1}; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:QLI62784.1}; +DR EMBL; MN614169; QLI62674.1; -; Genomic_DNA. +DR EMBL; MN614175; QLI62680.1; -; Genomic_DNA. +DR EMBL; MN614279; QLI62784.1; -; Genomic_DNA. +DR AlphaFoldDB; A0A7D5YZ42; -. +DR PeptideAtlas; A0A7D5YZ42; -. +DR Gene3D; 1.10.630.10; Cytochrome P450; 1. +DR InterPro; IPR001128; Cyt_P450. +DR InterPro; IPR002401; Cyt_P450_E_grp-I. +DR InterPro; IPR036396; Cyt_P450_sf. +DR PANTHER; PTHR24300:SF336; CYTOCHROME P450 2C9; 1. +DR PANTHER; PTHR24300; CYTOCHROME P450 508A4-RELATED; 1. +DR Pfam; PF00067; p450; 1. +DR PRINTS; PR00463; EP450I. +DR SUPFAM; SSF48264; Cytochrome P450; 1. +PE 3: Inferred from homology; +KW Heme {ECO:0000256|ARBA:ARBA00022617}; Iron {ECO:0000256|ARBA:ARBA00023004}; +KW Metal-binding {ECO:0000256|ARBA:ARBA00022723}. +SQ SEQUENCE 106 AA; 12035 MW; 9894A14D4BE1A349 CRC64; + LSKVYGPVFT LYFGLKPIVV LHGYEAVKEA LIDLGEEFSG RGIFPLAERA NRGFGIVFSN + GKKWKEIRHF SLMTLRNFGM GKRSIEDRVQ EEARCLVEEL RKTKGG +// +ID K4GY12_HUMAN Unreviewed; 226 AA. +AC K4GY12; +DT 09-JAN-2013, integrated into UniProtKB/TrEMBL. +DT 09-JAN-2013, sequence version 1. +DT 29-MAY-2024, entry version 40. +DE RecName: Full=ATP synthase subunit a {ECO:0000256|ARBA:ARBA00021312, ECO:0000256|RuleBase:RU004450}; +GN Name=ATP6 {ECO:0000313|EMBL:AFP96372.1}; +OS Homo sapiens (Human). +OG Mitochondrion {ECO:0000313|EMBL:AFP96372.1}. +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:AFP96372.1}; +DR EMBL; JX289125; AFP96372.1; -; Genomic_DNA. 
+DR EMBL; MF621083; ATP05077.1; -; Genomic_DNA. +DR AlphaFoldDB; K4GY12; -. +DR PeptideAtlas; K4GY12; -. +DR ChiTaRS; ATP6; human. +DR CDD; cd00310; ATP-synt_Fo_a_6; 1. +DR Gene3D; 1.20.120.220; ATP synthase, F0 complex, subunit A; 1. +DR InterPro; IPR000568; ATP_synth_F0_asu. +DR InterPro; IPR023011; ATP_synth_F0_asu_AS. +DR InterPro; IPR045083; ATP_synth_F0_asu_bact/mt. +DR InterPro; IPR035908; F0_ATP_A_sf. +DR NCBIfam; TIGR01131; ATP_synt_6_or_A; 1. +DR PANTHER; PTHR11410; ATP SYNTHASE SUBUNIT A; 1. +DR PANTHER; PTHR11410:SF0; ATP SYNTHASE SUBUNIT A; 1. +DR Pfam; PF00119; ATP-synt_A; 1. +DR PRINTS; PR00123; ATPASEA. +DR SUPFAM; SSF81336; F1F0 ATP synthase subunit A; 1. +DR PROSITE; PS00449; ATPASE_A; 1. +PE 3: Inferred from homology; +KW ATP synthesis {ECO:0000256|ARBA:ARBA00023310}; +KW CF(0) {ECO:0000256|ARBA:ARBA00022547}; +KW Hydrogen ion transport {ECO:0000256|ARBA:ARBA00022781}; +KW Ion transport {ECO:0000256|ARBA:ARBA00023065}; +KW Membrane {ECO:0000256|ARBA:ARBA00023136, ECO:0000256|SAM:Phobius}; +KW Mitochondrion {ECO:0000256|ARBA:ARBA00023128, ECO:0000313|EMBL:AFP96372.1}; +KW Mitochondrion inner membrane {ECO:0000256|ARBA:ARBA00022792}; +KW Transmembrane {ECO:0000256|ARBA:ARBA00022692, ECO:0000256|SAM:Phobius}; +KW Transmembrane helix {ECO:0000256|ARBA:ARBA00022989, +KW ECO:0000256|SAM:Phobius}; Transport {ECO:0000256|ARBA:ARBA00022448}. +SQ SEQUENCE 226 AA; 24785 MW; 7211E3A429C0D966 CRC64; + MNENLFASFI APTILGLPAA VLIILFPPLL IPTSKYLINN RLITTQQWLI KLTSKQMMAM + HNTKGRTWSL MLVSLIIFIA TTNLLGLLPH SFTPTTQLSM NLAMAIPLWV GAVIMGFRSK + IKNALAHFLP QGTPTPLIPM LVIIETISLL IQPMALAVRL TANITAGHLL MHLIGSATLA + MSTINLPSTL IIFTILILLT ILEIAVALIQ AYVFTLLVSL YLHDNT +// +ID A0A383S2L7_HUMAN Unreviewed; 366 AA. +AC A0A383S2L7; +DT 07-NOV-2018, integrated into UniProtKB/TrEMBL. +DT 07-NOV-2018, sequence version 1. +DT 29-MAY-2024, entry version 26. +DE SubName: Full=MHC class I antigen {ECO:0000313|EMBL:SYY42737.1}; +GN Name=HLA-C {ECO:0000313|EMBL:SYY42737.1}; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:SYY42737.1}; +DR EMBL; MK005641; AZL48948.1; -; Genomic_DNA. +DR EMBL; LS992437; SYY42737.1; -; Genomic_DNA. +DR AlphaFoldDB; A0A383S2L7; -. +DR PeptideAtlas; A0A383S2L7; -. +DR ChiTaRS; HLA-C; human. +DR CDD; cd21025; IgC1_MHC_Ib_HLA-Cw3-4; 1. +DR Gene3D; 2.60.40.10; Immunoglobulins; 1. +DR Gene3D; 3.30.500.10; MHC class I-like antigen recognition-like; 1. +DR InterPro; IPR007110; Ig-like_dom. +DR InterPro; IPR036179; Ig-like_dom_sf. +DR InterPro; IPR013783; Ig-like_fold. +DR InterPro; IPR003006; Ig/MHC_CS. +DR InterPro; IPR003597; Ig_C1-set. +DR InterPro; IPR011161; MHC_I-like_Ag-recog. +DR InterPro; IPR037055; MHC_I-like_Ag-recog_sf. +DR InterPro; IPR011162; MHC_I/II-like_Ag-recog. +DR InterPro; IPR001039; MHC_I_a_a1/a2. +DR InterPro; IPR010579; MHC_I_a_C. +DR PANTHER; PTHR16675:SF252; HLA CLASS I HISTOCOMPATIBILITY ANTIGEN, C ALPHA CHAIN; 1. +DR PANTHER; PTHR16675; MHC CLASS I-RELATED; 1. +DR Pfam; PF07654; C1-set; 1. +DR Pfam; PF00129; MHC_I; 1. +DR Pfam; PF06623; MHC_I_C; 1. +DR PRINTS; PR01638; MHCCLASSI. +DR SMART; SM00407; IGc1; 1. +DR SUPFAM; SSF48726; Immunoglobulin; 1. +DR SUPFAM; SSF54452; MHC antigen-recognition domain; 1. +DR PROSITE; PS50835; IG_LIKE; 1. +DR PROSITE; PS00290; IG_MHC; 1. 
+PE 3: Inferred from homology; +KW Disulfide bond {ECO:0000256|ARBA:ARBA00023157}; +KW Glycoprotein {ECO:0000256|ARBA:ARBA00023180}; +KW Membrane {ECO:0000256|SAM:Phobius}; +KW Signal {ECO:0000256|ARBA:ARBA00022729, ECO:0000256|SAM:SignalP}; +KW Transmembrane {ECO:0000256|SAM:Phobius}; +KW Transmembrane helix {ECO:0000256|SAM:Phobius}. +SQ SEQUENCE 366 AA; 40967 MW; 8482C454FA80E378 CRC64; + MRVMAPRTLI LLLSGALALT ETWACSHSMR YFSTSVSRPG RWEPRFIAVG YVDDTQFVRF + DSDAASPRGE PRAPWVEQEG PEYWDRETQK YKRQAQTDRV SLRNLRGYYN QSEAGSHTLQ + WMFGCDLGPD GRLLRGYDQS AYDGKDYIAL NEDLRSWTAA DTAAQITQRK WEAAREAEQR + RAYLEGTCVE WLRRYLENGK ETLQRAEHPK THVTHHPVSD HEATLRCWAL GFYPAEITLT + WQWDGEDQTQ DTELVETRPA GDGTFQKWAA VVVPSGEEQR YTCHVQHEGL PEPLTLRWEP + SSQPTIPIVG IVAGLAVLAV LAVLGAVVAV VMCRRKSSGG KGGSCSQAAS SNSAQGSDES + LIACKA +// +ID O19554_HUMAN Unreviewed; 362 AA. +AC O19554; +DT 01-JAN-1998, integrated into UniProtKB/TrEMBL. +DT 01-JAN-1998, sequence version 1. +DT 29-MAY-2024, entry version 165. +DE SubName: Full=MHC class I antigen {ECO:0000313|EMBL:AAC17467.1}; +GN Name=HLA-B {ECO:0000313|EMBL:AAC17467.1}; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:AAC17467.1}; +DR EMBL; AF016641; AAC17467.1; -; mRNA. +DR EMBL; KX555571; ART85731.1; -; Genomic_DNA. +DR EMBL; LT618830; SCQ83646.1; -; Genomic_DNA. +DR PIR; S24434; S24434. +DR PIR; S24435; S24435. +DR PIR; S24436; S24436. +DR PIR; S24437; S24437. +DR PIR; S24439; S24439. +DR PIR; S24440; S24440. +DR AlphaFoldDB; O19554; -. +DR PeptideAtlas; O19554; -. +DR ChiTaRS; HLA-B; human. +DR CDD; cd21026; IgC1_MHC_Ia_HLA-B; 1. +DR Gene3D; 2.60.40.10; Immunoglobulins; 1. +DR Gene3D; 3.30.500.10; MHC class I-like antigen recognition-like; 1. +DR InterPro; IPR007110; Ig-like_dom. +DR InterPro; IPR036179; Ig-like_dom_sf. +DR InterPro; IPR013783; Ig-like_fold. +DR InterPro; IPR003006; Ig/MHC_CS. +DR InterPro; IPR003597; Ig_C1-set. +DR InterPro; IPR011161; MHC_I-like_Ag-recog. +DR InterPro; IPR037055; MHC_I-like_Ag-recog_sf. +DR InterPro; IPR011162; MHC_I/II-like_Ag-recog. +DR InterPro; IPR001039; MHC_I_a_a1/a2. +DR InterPro; IPR010579; MHC_I_a_C. +DR PANTHER; PTHR16675:SF270; HLA CLASS I HISTOCOMPATIBILITY ANTIGEN, B ALPHA CHAIN; 1. +DR PANTHER; PTHR16675; MHC CLASS I-RELATED; 1. +DR Pfam; PF07654; C1-set; 1. +DR Pfam; PF00129; MHC_I; 1. +DR Pfam; PF06623; MHC_I_C; 1. +DR PRINTS; PR01638; MHCCLASSI. +DR SMART; SM00407; IGc1; 1. +DR SUPFAM; SSF48726; Immunoglobulin; 1. +DR SUPFAM; SSF54452; MHC antigen-recognition domain; 1. +DR PROSITE; PS50835; IG_LIKE; 1. +DR PROSITE; PS00290; IG_MHC; 1. +PE 2: Evidence at transcript level; +KW Disulfide bond {ECO:0000256|ARBA:ARBA00023157}; +KW Glycoprotein {ECO:0000256|ARBA:ARBA00023180}; +KW Membrane {ECO:0000256|SAM:Phobius}; +KW Signal {ECO:0000256|ARBA:ARBA00022729, ECO:0000256|SAM:SignalP}; +KW Transmembrane {ECO:0000256|SAM:Phobius}; +KW Transmembrane helix {ECO:0000256|SAM:Phobius}. 
+SQ SEQUENCE 362 AA; 40438 MW; 8BAA65B28D3BA262 CRC64; + MRVTAPRTVL LLLSGALALT ETWAGSHSMR YFYTAMSRPG RGEPRFISVG YVDDTQFVRF + DSDAASPREE PRAPWIEQEG PEYWDRNTQI CKTNTQTYRE SLRNLRGYYN QSEAGSHTLQ + RMYGCDVGPD GRLLRGHDQY AYDGKDYIAL NEDLSSWTAA DTAAQITQRK WEAAREAEQL + RAYLEGLCVE WLRRHLENGK ETLQRADPPK THVTHHPISD HEATLRCWAL GFYPAEITLT + WQRDGEDQTQ DTELVETRPA GDRTFQKWAA VVVPSGEEQR YTCHVQHEGL PKPLTLRWEP + SSQSTIPIVG IVAGLAVLAV VVIGAVVATV MCRRKSSGGK GGSYSQAASS DSAQGSDVSL + TA +// +ID H6WGQ1_HUMAN Unreviewed; 603 AA. +AC H6WGQ1; +DT 18-APR-2012, integrated into UniProtKB/TrEMBL. +DT 18-APR-2012, sequence version 1. +DT 29-MAY-2024, entry version 65. +DE RecName: Full=NADH-ubiquinone oxidoreductase chain 5 {ECO:0000256|ARBA:ARBA00021096, ECO:0000256|RuleBase:RU003404}; +DE EC=7.1.1.2 {ECO:0000256|ARBA:ARBA00012944, ECO:0000256|RuleBase:RU003404}; +GN Name=ND5 {ECO:0000313|EMBL:AEY70993.1}; +OS Homo sapiens (Human). +OG Mitochondrion {ECO:0000313|EMBL:AEY70993.1}. +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:AEY70993.1}; +DR EMBL; JQ245734; AEY70993.1; -; Genomic_DNA. +DR EMBL; JQ245735; AEY71006.1; -; Genomic_DNA. +DR EMBL; JX135002; AFN20958.1; -; Genomic_DNA. +DR EMBL; KF450844; AGZ64615.1; -; Genomic_DNA. +DR EMBL; KJ446052; AHX46740.1; -; Genomic_DNA. +DR EMBL; KP763844; AKB98866.1; -; Genomic_DNA. +DR EMBL; MF437284; QBZ76936.1; -; Genomic_DNA. +DR AlphaFoldDB; H6WGQ1; -. +DR PeptideAtlas; H6WGQ1; -. +DR ChiTaRS; MT-ND5; human. +DR InterPro; IPR010934; NADH_DH_su5_C. +DR InterPro; IPR018393; NADHpl_OxRdtase_5_subgr. +DR InterPro; IPR001750; ND/Mrp_mem. +DR InterPro; IPR003945; NU5C-like. +DR InterPro; IPR001516; Proton_antipo_N. +DR NCBIfam; TIGR01974; NDH_I_L; 1. +DR PANTHER; PTHR42829; NADH-UBIQUINONE OXIDOREDUCTASE CHAIN 5; 1. +DR PANTHER; PTHR42829:SF2; NADH-UBIQUINONE OXIDOREDUCTASE CHAIN 5; 1. +DR Pfam; PF06455; NADH5_C; 1. +DR Pfam; PF00361; Proton_antipo_M; 1. +DR Pfam; PF00662; Proton_antipo_N; 1. +DR PRINTS; PR01434; NADHDHGNASE5. +PE 3: Inferred from homology; +KW Electron transport {ECO:0000256|ARBA:ARBA00022982}; +KW Membrane {ECO:0000256|ARBA:ARBA00023136, ECO:0000256|RuleBase:RU003404}; +KW Mitochondrion {ECO:0000256|ARBA:ARBA00023128, +KW ECO:0000256|RuleBase:RU003404}; +KW NAD {ECO:0000256|ARBA:ARBA00023027, ECO:0000256|RuleBase:RU003404}; +KW Respiratory chain {ECO:0000256|ARBA:ARBA00022660}; +KW Signal {ECO:0000256|SAM:SignalP}; +KW Translocase {ECO:0000256|ARBA:ARBA00022967}; +KW Transmembrane {ECO:0000256|ARBA:ARBA00022692, +KW ECO:0000256|RuleBase:RU003404}; +KW Transmembrane helix {ECO:0000256|ARBA:ARBA00022989, +KW ECO:0000256|RuleBase:RU003404}; +KW Transport {ECO:0000256|ARBA:ARBA00022448, ECO:0000256|RuleBase:RU003404}; +KW Ubiquinone {ECO:0000256|RuleBase:RU003404}. 
+SQ SEQUENCE 603 AA; 66955 MW; 897749F5B5EA5860 CRC64; + MTMHTTMTAL TLTSLIPPIL TTLVNPNKKN SYPHYVKSIV ASTFIISLFP TTMFMCLDQE + VIISNWHWAT TQTTQLSLSF KLDYFSMMFI PVALFVTWSI MEFSLWYMNS DPNINQFFKY + LLIFLITMLI LVTANNLFQL FIGWEGVGIM SFLLISWWYA RADANTAAIQ AILYNRIGDI + GFILALAWFI LHSNSWDPQQ MALLNANPSL TPLLGLLLAA AGKSAQLGLH PWLPSAMEGP + TPVSALLHSS TMVVAGIFLL IRFHPLAENS PLIQTLTLCL GAITTLFAAV CALTQNDIKK + IVAFSTSSQL GLMMVTIGIN QPHLAFLHIC THAFFKAMLF MCSGSIIHNL NNEQDIRKMG + GLLKTMPLTS TSLTIGSLAL AGMPFLTGFY SKDHIIETAN MSYTNAWALS ITLIATSLTS + AYSTRMILLT LTGQPRFPTL TNINENNPTL LNPIKRLAAG SLFAGFLITN SISPASPFQT + TVPLYLKLTA LAVTFLGLLT ALDLNYLTNK LKMKSPLCTF YFSNMLGFYP SITHRTIPYL + GLLTSQNLPL LLLDLTWLEK LLPKTISQHQ ISTSIITSTQ KGMIKLYFLS FFFPLILTLL + LIT +// +ID A0A1U9X8F2_HUMAN Unreviewed; 406 AA. +AC A0A1U9X8F2; +DT 07-JUN-2017, integrated into UniProtKB/TrEMBL. +DT 07-JUN-2017, sequence version 1. +DT 29-MAY-2024, entry version 45. +DE RecName: Full=RING-type E3 ubiquitin transferase {ECO:0000256|ARBA:ARBA00012483}; +DE EC=2.3.2.27 {ECO:0000256|ARBA:ARBA00012483}; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:AQY77073.1}; +DR EMBL; KY500599; AQY77071.1; -; Genomic_DNA. +DR EMBL; KY500600; AQY77072.1; -; Genomic_DNA. +DR EMBL; KY500601; AQY77073.1; -; Genomic_DNA. +DR RefSeq; NP_002922.2; NM_002931.3. +DR AlphaFoldDB; A0A1U9X8F2; -. +DR SMR; A0A1U9X8F2; -. +DR Antibodypedia; 1772; 641 antibodies from 36 providers. +DR DNASU; 6015; -. +DR GeneID; 6015; -. +DR KEGG; hsa:6015; -. +DR CTD; 6015; -. +DR DisGeNET; 6015; -. +DR VEuPathDB; HostDB:ENSG00000204227; -. +DR OMA; GAEDNCD; -. +DR OrthoDB; 460116at2759; -. +DR UniPathway; UPA00143; -. +DR ExpressionAtlas; A0A1U9X8F2; baseline and differential. +DR CDD; cd17166; RAWUL_RING1; 1. +DR CDD; cd16740; RING-HC_RING2; 1. +DR Gene3D; 3.30.40.10; Zinc/RING finger domain, C3HC4 (zinc finger); 1. +DR InterPro; IPR032443; RAWUL. +DR InterPro; IPR043540; RING1/RING2. +DR InterPro; IPR001841; Znf_RING. +DR InterPro; IPR013083; Znf_RING/FYVE/PHD. +DR InterPro; IPR017907; Znf_RING_CS. +DR PANTHER; PTHR46076:SF5; E3 UBIQUITIN-PROTEIN LIGASE RING1; 1. +DR PANTHER; PTHR46076; E3 UBIQUITIN-PROTEIN LIGASE RING1 / RING 2 FAMILY MEMBER; 1. +DR Pfam; PF16207; RAWUL; 1. +DR Pfam; PF13923; zf-C3HC4_2; 1. +DR SMART; SM00184; RING; 1. +DR SUPFAM; SSF57850; RING/U-box; 1. +DR PROSITE; PS00518; ZF_RING_1; 1. +DR PROSITE; PS50089; ZF_RING_2; 1. +PE 4: Predicted; +KW Metal-binding {ECO:0000256|ARBA:ARBA00022723}; +KW Nucleus {ECO:0000256|ARBA:ARBA00023242}; +KW Zinc {ECO:0000256|ARBA:ARBA00022833}; +KW Zinc-finger {ECO:0000256|ARBA:ARBA00022771, ECO:0000256|PROSITE- +KW ProRule:PRU00175}. +SQ SEQUENCE 406 AA; 42429 MW; 6959787479DE9DAB CRC64; + MTTPANAQNA SKTWELSLYE LHRTPQEAIM DGTEIAVSPR SLHSELMCPI CLDMLKNTMT + TKECLHRFCS DCIVTALRSG NKECPTCRKK LVSKRSLRPD PNFDALISKI YPSREEYEAH + QDRVLIRLSR LHNQQALSSS IEEGLRMQAM HRAQRVRRPI PGSDQTTTMS GGEGEPGEGE + GDGEDVSSDS APDSAPGPAP KRPRGGGAGG SSVGTGGGGT GGVGGGAGSE DSGDRGGTLG + GGTLGPPSPP GAPSPPEPGG EIELVFRPHP LLVEKGEYCQ TRYVKTTGNA TVDHLSKYLA + LRIALERRQQ QEAGEPGGPG GGASDTGGPD GCGGEGGGAG GGDGPEEPAL PSLEGVSEKQ + YTIYIAPGGG AFTTLNGSLT LELVNEKFWK VSRPLELCYA PTKDPK +// +ID Q4F4R7_HUMAN Unreviewed; 226 AA. +AC Q4F4R7; +DT 30-AUG-2005, integrated into UniProtKB/TrEMBL. +DT 30-AUG-2005, sequence version 1. +DT 29-MAY-2024, entry version 122. 
+DE RecName: Full=ATP synthase subunit a {ECO:0000256|ARBA:ARBA00021312, ECO:0000256|RuleBase:RU004450}; +GN Name=ATP6 {ECO:0000313|EMBL:AAZ00441.2}; +OS Homo sapiens (Human). +OG Mitochondrion {ECO:0000313|EMBL:AAZ00441.2}. +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:AAZ00441.2}; +DR EMBL; DQ112736; AAZ00441.2; -; Genomic_DNA. +DR EMBL; DQ305032; ABB99315.1; -; Genomic_DNA. +DR EMBL; DQ341077; ABC60775.1; -; Genomic_DNA. +DR EMBL; EU092680; ABU64269.1; -; Genomic_DNA. +DR EMBL; FJ625860; ACO92279.1; -; Genomic_DNA. +DR EMBL; GU455415; ADD20772.1; -; Genomic_DNA. +DR EMBL; JN655777; AEQ26443.1; -; Genomic_DNA. +DR EMBL; JQ044831; AEV47375.1; -; Genomic_DNA. +DR EMBL; JQ703621; AFF85036.1; -; Genomic_DNA. +DR EMBL; KC622073; AGJ95426.1; -; Genomic_DNA. +DR EMBL; KC911360; AGO91425.1; -; Genomic_DNA. +DR EMBL; KF011503; AGQ46203.1; -; Genomic_DNA. +DR EMBL; KF055329; AGS17760.1; -; Genomic_DNA. +DR EMBL; KF451436; AGZ72306.1; -; Genomic_DNA. +DR EMBL; KJ185406; AID07692.1; -; Genomic_DNA. +DR EMBL; KM101583; AIU57283.1; -; Genomic_DNA. +DR EMBL; KT819211; ALM03295.1; -; Genomic_DNA. +DR EMBL; MF055891; ASR95513.1; -; Genomic_DNA. +DR EMBL; MF621071; ATP04921.1; -; Genomic_DNA. +DR EMBL; KY797207; AUG82832.1; -; Genomic_DNA. +DR EMBL; MF381288; AUR39889.1; -; Genomic_DNA. +DR EMBL; MF696005; AUT79179.1; -; Genomic_DNA. +DR EMBL; MH981647; AYV90620.1; -; Genomic_DNA. +DR PeptideAtlas; Q4F4R7; -. +DR ChiTaRS; ATP6; human. +DR CDD; cd00310; ATP-synt_Fo_a_6; 1. +DR Gene3D; 1.20.120.220; ATP synthase, F0 complex, subunit A; 1. +DR InterPro; IPR000568; ATP_synth_F0_asu. +DR InterPro; IPR023011; ATP_synth_F0_asu_AS. +DR InterPro; IPR045083; ATP_synth_F0_asu_bact/mt. +DR InterPro; IPR035908; F0_ATP_A_sf. +DR NCBIfam; TIGR01131; ATP_synt_6_or_A; 1. +DR PANTHER; PTHR11410; ATP SYNTHASE SUBUNIT A; 1. +DR PANTHER; PTHR11410:SF0; ATP SYNTHASE SUBUNIT A; 1. +DR Pfam; PF00119; ATP-synt_A; 1. +DR PRINTS; PR00123; ATPASEA. +DR SUPFAM; SSF81336; F1F0 ATP synthase subunit A; 1. +DR PROSITE; PS00449; ATPASE_A; 1. +PE 3: Inferred from homology; +KW ATP synthesis {ECO:0000256|ARBA:ARBA00023310}; +KW CF(0) {ECO:0000256|ARBA:ARBA00022547}; +KW Hydrogen ion transport {ECO:0000256|ARBA:ARBA00022781}; +KW Ion transport {ECO:0000256|ARBA:ARBA00023065}; +KW Membrane {ECO:0000256|ARBA:ARBA00023136, ECO:0000256|SAM:Phobius}; +KW Mitochondrion {ECO:0000256|ARBA:ARBA00023128, ECO:0000313|EMBL:AAZ00441.2}; +KW Mitochondrion inner membrane {ECO:0000256|ARBA:ARBA00022792}; +KW Transmembrane {ECO:0000256|ARBA:ARBA00022692, ECO:0000256|SAM:Phobius}; +KW Transmembrane helix {ECO:0000256|ARBA:ARBA00022989, +KW ECO:0000256|SAM:Phobius}; Transport {ECO:0000256|ARBA:ARBA00022448}. +SQ SEQUENCE 226 AA; 24747 MW; ADC1F79724D46108 CRC64; + MNENLFASFI APTILGLPAA VLIILFPPLL IPTSKYLINN RLITTQQWLI KLTSKQMMAM + HNTKGRTWSL MLVSLIIFIA TTNLLGLLPH SFTPTTQLSM NLAMAIPLWA GAVIMGFRSK + IKNALAHFLP QGTPTSLIPM LVIIETISLL IQPMALAVRL TANITAGHLL MHLIGSATLA + MSTINLPSTL IIFTILILLT ILEIAVALIQ AYVFTLLVSL YLHDNT +// +ID A4ZMD8_HUMAN Unreviewed; 174 AA. +AC A4ZMD8; +DT 29-MAY-2007, integrated into UniProtKB/TrEMBL. +DT 29-MAY-2007, sequence version 1. +DT 29-MAY-2024, entry version 75. 
+DE RecName: Full=NADH-ubiquinone oxidoreductase chain 6 {ECO:0000256|ARBA:ARBA00021095, ECO:0000256|RuleBase:RU004430}; +DE EC=7.1.1.2 {ECO:0000256|ARBA:ARBA00012944, ECO:0000256|RuleBase:RU004430}; +GN Name=ND6 {ECO:0000313|EMBL:ABO39784.1}; +OS Homo sapiens (Human). +OG Mitochondrion {ECO:0000313|EMBL:ABO39784.1}. +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:ABO39784.1}; +DR EMBL; EF184623; ABO39784.1; -; Genomic_DNA. +DR EMBL; GU296592; ADB05633.1; -; Genomic_DNA. +DR AlphaFoldDB; A4ZMD8; -. +DR PeptideAtlas; A4ZMD8; -. +DR ChiTaRS; MT-ND6; human. +DR InterPro; IPR001457; NADH_UbQ/plastoQ_OxRdtase_su6. +DR PANTHER; PTHR11435; NADH UBIQUINONE OXIDOREDUCTASE SUBUNIT ND6; 1. +DR PANTHER; PTHR11435:SF1; NADH-UBIQUINONE OXIDOREDUCTASE CHAIN 6; 1. +DR Pfam; PF00499; Oxidored_q3; 1. +PE 3: Inferred from homology; +KW Electron transport {ECO:0000256|RuleBase:RU004430}; +KW Membrane {ECO:0000256|RuleBase:RU004430}; +KW Mitochondrion {ECO:0000256|RuleBase:RU004430, ECO:0000313|EMBL:ABO39784.1}; +KW Mitochondrion inner membrane {ECO:0000256|ARBA:ARBA00022792}; +KW NAD {ECO:0000256|RuleBase:RU004430}; +KW Respiratory chain {ECO:0000256|RuleBase:RU004430}; +KW Signal {ECO:0000256|SAM:SignalP}; +KW Translocase {ECO:0000256|RuleBase:RU004430}; +KW Transmembrane {ECO:0000256|RuleBase:RU004430}; +KW Transmembrane helix {ECO:0000256|RuleBase:RU004430}; +KW Transport {ECO:0000256|ARBA:ARBA00022448, ECO:0000256|RuleBase:RU004430}; +KW Ubiquinone {ECO:0000256|RuleBase:RU004430}. +SQ SEQUENCE 174 AA; 18594 MW; E50CD9EB4DAC600D CRC64; + MMYALFLLSV GLVMGFVGFS SKPSPIYGGL VLIVSGVVGC VIILNFGGGY MGLMVFLIYL + GGMMVVFGYT TAMAIEEYPE AWGSGVEVLV SVLVGLAMEV GLVLWVKEYD GVVVVVNFNS + VGSWMIYEGE GSGLIREDPI GAGALYDYGR WLVVATGWTL FVGVYIVIEI ARGN +// diff --git a/src/python/test/xrefs/parsers/flatfiles/vgnc.txt b/src/python/test/xrefs/parsers/flatfiles/vgnc.txt new file mode 100644 index 000000000..b3b789376 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/vgnc.txt @@ -0,0 +1,11 @@ +taxon_id vgnc_id symbol name locus_group locus_type status location location_sortable: alias_symbol alias_name prev_symbol prev_name gene_family gene_family_id date_approved_reserved date_symbol_changed date_name_changed date_modified entrez_id ensembl_gene_id uniprot_ids +9796 VGNC:15375 AP1M1 adaptor related protein complex 1 subunit mu 1 protein-coding gene gene with protein product Approved 21 021 "adaptor related protein complex 1 mu 1 subunit" 2017-08-03 2018-04-25 2018-04-25 100069477 ENSECAG00000019502 "F7E1T6" +9796 VGNC:18055 FKBP8 FKBP prolyl isomerase 8 protein-coding gene gene with protein product Approved 21 021 "FK506 binding protein 8" 2017-08-03 2018-11-03 2018-11-03 100069569 ENSECAG00000012306 "F6RVH1" +9796 VGNC:18834 HOMER3 homer scaffold protein 3 protein-coding gene gene with protein product Approved 21 021 "homer scaffolding protein 3" 2017-08-03 2018-03-02 2018-03-02 100146769 ENSECAG00000023105 "F6VIN3" +9598 VGNC:14659 CYYR1 cysteine and tyrosine rich 1 protein-coding gene gene with protein product Approved 21 021 test_synonym 2017-01-05 2017-01-05 744581 ENSPTRG00000013812 "A0A2I3T968|H2QKV9" +9598 VGNC:3738 DIP2A disco interacting protein 2 homolog A protein-coding gene gene with protein product Approved 21 021 DIP2 2015-11-23 2015-11-23 2015-11-23 458619 ENSPTRG00000014025 "A0A2I3RVQ1|A0A2I3S1A1|K7CC60" +9598 VGNC:14660 DNAJC28 DnaJ heat shock 
protein family (Hsp40) member C28 protein-coding gene gene with protein product Approved 21 021 2017-01-05 2017-01-05 474095 ENSPTRG00000013870 "H2RAJ2" +9598 VGNC:14661 DNMT3L DNA methyltransferase 3 like protein-coding gene gene with protein product Approved 21 021 2017-01-05 2016-06-28 2017-01-05 470099 ENSPTRG00000013973 "H2QL42" +9598 VGNC:1158 DONSON downstream neighbor of SON protein-coding gene gene with protein product Approved 21 021 2015-11-10 2015-11-10 746652 ENSPTRG00000034273 "A0A2I3SLQ4|A0A2I3TCH4|G2HF01" +9598 VGNC:1152 DSCAM DS cell adhesion molecule protein-coding gene gene with protein product Approved 21 021 2015-11-10 2016-05-16 2016-05-16 747803 ENSPTRG00000013922 "A0A2I3RHK5|A0A2I3T5L1|H2QL16" +9796 VGNC:23003 MTREX Mtr4 exosome RNA helicase protein-coding gene gene with protein product Approved 21 021 "SKIV2L2" "Ski2 like RNA helicase 2" 2017-08-03 2017-11-26 2017-11-26 2017-11-26 100051945 ENSECAG00000023723 "F6YYD8" diff --git a/src/python/test/xrefs/parsers/flatfiles/xenopus_jamboree.txt b/src/python/test/xrefs/parsers/flatfiles/xenopus_jamboree.txt new file mode 100644 index 000000000..48452ff20 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/xenopus_jamboree.txt @@ -0,0 +1,12 @@ +XB-GENE-478054 trnt1 tRNA nucleotidyl transferase, CCA-adding, 1 ENSXETG00000025091 +XB-GENE-478064 foxh1.2 forkhead box H1, gene 2 ENSXETG00000005286 +XB-GENE-478074 nr5a2 nuclear receptor subfamily 5 group A member 2 ENSXETG00000000314 +XB-GENE-478084 tbx1 T-box 1 ENSXETG00000006304 +XB-GENE-478094 nr1d1 nuclear receptor subfamily 1 group D member 1 ENSXETG00000024397 +XB-GENE-478104 nucb1 nucleobindin 1 ENSXETG00000021229 +XB-GENE-478113 nsa2 NSA2, ribosome biogenesis homolog ENSXETG00000005077 +XB-GENE-478121 csnk1a1 casein kinase 1 alpha 1 ENSXETG00000020861 +XB-GENE-478131 hoxc6 homeobox C6 ENSXETG00000023479 +XB-GENE-478141 hba1 hemoglobin subunit alpha 1 ENSXETG00000025664 +XB-GENE-940866 rtp3c receptor (chemosensory) transporter protein 3 gene C [provisional] ENSXETG00000019753 +XB-GENE-981482 or1e2l conserved hypothetical olfactory receptor, 8 of 17 ENSXETG00000026609 diff --git a/src/python/test/xrefs/parsers/flatfiles/zfin/aliases.txt b/src/python/test/xrefs/parsers/flatfiles/zfin/aliases.txt new file mode 100644 index 000000000..3bc5d5b6b --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/zfin/aliases.txt @@ -0,0 +1,10 @@ +ZDB-GENE-000125-12 Df(Chr03)c1033 c1033 c1033 SO:1000029 +ZDB-GENE-000125-12 Df(Chr03)c1033 c1033 Df(LG03) SO:1000029 +ZDB-GENE-000125-12 Df(Chr03)c1033 c1033 Df(LG03)c1033 SO:1000029 +ZDB-ALT-000405-2 Df(Chr24:reck)w15 w15 Df(Chr24:reck)w15 SO:1000029 +ZDB-ALT-000405-2 Df(Chr24:reck)w15 w15 w15 SO:1000029 +ZDB-ALT-000712-2 Df(Chr9:epb41l5,ptpn4a,tmem177,pth2ra,hs6st3b,ramp1)b476 b476 b476 SO:1000029 +ZDB-ALT-000712-2 Df(Chr9:epb41l5,ptpn4a,tmem177,pth2ra,hs6st3b,ramp1)b476 b476 Df(Chr9:epb41l5,ptpn4a,tmem177,pth2ra)b476 SO:1000029 +ZDB-ALT-000712-2 Df(Chr9:epb41l5,ptpn4a,tmem177,pth2ra,hs6st3b,ramp1)b476 b476 moe SO:1000029 +ZDB-GENE-000128-18 zc1Tg zc1Tg Tg(NBT:MAPT-GFP) SO:0001218 +ZDB-GENE-000128-18 zc1Tg zc1Tg Tg(NBT:MAPT-GFP)zc1 SO:0001218 \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/flatfiles/zfin/ensembl_1_to_1.txt b/src/python/test/xrefs/parsers/flatfiles/zfin/ensembl_1_to_1.txt new file mode 100644 index 000000000..e7f89f7c5 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/zfin/ensembl_1_to_1.txt @@ -0,0 +1,10 @@ +ZDB-GENE-000112-47 SO:0001217 ppardb ENSDARG00000009473 
+ZDB-GENE-000125-12 SO:0001217 igfbp2a ENSDARG00000052470 +ZDB-GENE-000125-4 SO:0001217 dlc ENSDARG00000002336 +ZDB-GENE-000128-11 SO:0001217 dbx1b ENSDARG00000001859 +ZDB-GENE-000128-8 SO:0001217 dbx1a ENSDARG00000086393 +ZDB-GENE-000201-13 SO:0001217 anos1b ENSDARG00000004932 +ZDB-GENE-000201-18 SO:0001217 pbx4 ENSDARG00000052150 +ZDB-GENE-000201-9 SO:0001217 anos1a ENSDARG00000012896 +ZDB-GENE-000208-13 SO:0001217 crestin ENSDARG00000105570 +ZDB-GENE-000208-17 SO:0001217 calr3a ENSDARG00000103979 diff --git a/src/python/test/xrefs/parsers/flatfiles/zfin/refseq.txt b/src/python/test/xrefs/parsers/flatfiles/zfin/refseq.txt new file mode 100644 index 000000000..a6bda8e48 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/zfin/refseq.txt @@ -0,0 +1,10 @@ +ZDB-GENE-000112-47 SO:0001217 ppardb NP_571543 +ZDB-GENE-000112-47 SO:0001217 ppardb XP_005167044 +ZDB-GENE-000112-47 SO:0001217 ppardb XM_009303927 +ZDB-GENE-000112-47 SO:0001217 ppardb XM_005166987 +ZDB-GENE-000112-47 SO:0001217 ppardb XP_009302202 +ZDB-GENE-000112-47 SO:0001217 ppardb NM_131468 +ZDB-GENE-000112-47 SO:0001217 ppardb XP_009302203 +ZDB-GENE-000112-47 SO:0001217 ppardb XM_009303928 +ZDB-GENE-000201-96 SO:0001217 igfbp2a NP_571533 +ZDB-GENE-000201-96 SO:0001217 igfbp2a NM_131458 diff --git a/src/python/test/xrefs/parsers/flatfiles/zfin/uniprot.txt b/src/python/test/xrefs/parsers/flatfiles/zfin/uniprot.txt new file mode 100644 index 000000000..f41aba0ab --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/zfin/uniprot.txt @@ -0,0 +1,10 @@ +ZDB-GENE-000112-47 SO:0001217 ppardb A9C4A5 +ZDB-GENE-000125-12 SO:0001217 igfbp2a Q9PTH3 +ZDB-GENE-000125-404 SO:0001217 dlc A4JYS0 +ZDB-GENE-000125-4 SO:0001217 dlc Q9IAT6 +ZDB-GENE-000128-11 SO:0001217 dbx1b B3DG51 +ZDB-GENE-000128-11 SO:0001217 dbx1b Q9PTU0 +ZDB-GENE-000128-13 SO:0001217 dbx2 A0A8M9PP76 +ZDB-GENE-000128-18 SO:0001217 dbx1a B2GNV2 +ZDB-GENE-000128-18 SO:0001217 dbx1a Q9PTU1 +ZDB-GENE-000201-9 SO:0001217 anos1b Q1MT36 diff --git a/src/python/test/xrefs/parsers/flatfiles/zfin_desc.txt b/src/python/test/xrefs/parsers/flatfiles/zfin_desc.txt new file mode 100644 index 000000000..89dc75d45 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/zfin_desc.txt @@ -0,0 +1,9 @@ +ZDB-GENE-030131-3003 HNF1 homeobox Bb hnf1bb 21 ZDB-REFCROSS-990707-1 +ZDB-GENE-030131-1077 hepatocyte nuclear factor 4, alpha hnf4a 23 ZDB-REFCROSS-000320-1 +ZDB-GENE-040718-488 WD repeat domain, phosphoinositide interacting 2 wipi2 0 +ZDB-GENE-070117-2473 wirbel wir 0 +ZDB-GENE-000710-5 WITHDRAWN:cripto WITHDRAWN:cripto 0 +ZDB-GENE-030516-5 WITHDRAWN:sb:cb476 WITHDRAWN:sb:cb476 0 +ZDB-GENE-030131-8698 WITHDRAWN:wu:fa94g04 WITHDRAWN:wu:fa94g04 0 +ZDB-GENE-070117-2162 lawrence welk wlk 0 +ZDB-GENE-040426-2161 wntless Wnt ligand secretion mediator wls 2 ZDB-REFCROSS-000320-1 diff --git a/src/python/test/xrefs/parsers/test_arrayexpress_parser.py b/src/python/test/xrefs/parsers/test_arrayexpress_parser.py new file mode 100644 index 000000000..db0379e08 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_arrayexpress_parser.py @@ -0,0 +1,110 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable +from types import SimpleNamespace + +from ensembl.production.xrefs.parsers.ArrayExpressParser import ArrayExpressParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link + +# Constants +SOURCE_ID_ARRAYEXPRESS = 1 +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" + +# Fixture to create an 
ArrayExpressParser instance +@pytest.fixture +def arrayexpress_parser() -> ArrayExpressParser: + return ArrayExpressParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(arrayexpress_parser: ArrayExpressParser, mock_xref_dbi: DBConnection, expected_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = arrayexpress_parser.run( + { + "source_id": SOURCE_ID_ARRAYEXPRESS, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing ArrayExpress data" + assert ( + f"Added {expected_xrefs} DIRECT xrefs" in result_message + ), f"{prefix}Expected 'Added {expected_xrefs} DIRECT xrefs' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id and species_id +def test_arrayexpress_no_source_id(arrayexpress_parser: ArrayExpressParser, test_no_source_id: Callable[[ArrayExpressParser, int], None]) -> None: + test_no_source_id(arrayexpress_parser, SPECIES_ID_HUMAN) + +def test_arrayexpress_no_species_id(arrayexpress_parser: ArrayExpressParser, test_no_species_id: Callable[[ArrayExpressParser, int], None]) -> None: + test_no_species_id(arrayexpress_parser, SOURCE_ID_ARRAYEXPRESS) + +# Test case to check if parsing is skipped when no species name can be found +def test_no_species_name(mock_xref_dbi: DBConnection, arrayexpress_parser: ArrayExpressParser) -> None: + result_code, result_message = arrayexpress_parser.run( + { + "source_id": SOURCE_ID_ARRAYEXPRESS, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"Errors when parsing ArrayExpress data" + assert ( + "Skipped. Could not find species ID to name mapping" in result_message + ), f"Expected 'Skipped. Could not find species ID to name mapping' in result_message, but got: '{result_message}'" + +# Test case to check if an error is raised when no ArrayExpress database is provided +def test_no_arrayexpress_db(arrayexpress_parser: ArrayExpressParser) -> None: + arrayexpress_parser.get_arrayexpress_db_url = MagicMock(return_value=None) + + with pytest.raises( + AttributeError, match="Could not find ArrayExpress DB. Missing or unsupported project value." 
+ ): + arrayexpress_parser.run( + { + "source_id": SOURCE_ID_ARRAYEXPRESS, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing of valid ArrayExpress data +def test_successful_parsing(mock_xref_dbi: DBConnection, arrayexpress_parser: ArrayExpressParser) -> None: + # Mock all needed methods + arrayexpress_parser.get_arrayexpress_db_url = MagicMock(return_value="mock_arrayexpress_db_url") + arrayexpress_data = [ + {"stable_id": "ENSG00000139618"}, + {"stable_id": "ENSG00000157764"}, + {"stable_id": "ENSG00000198786"}, + {"stable_id": "ENSG00000248378"}, + {"stable_id": "ENSG00000248379"}, + ] + arrayexpress_data_obj = [SimpleNamespace(**item) for item in arrayexpress_data] + arrayexpress_parser.get_arrayexpress_data = MagicMock(return_value=arrayexpress_data_obj) + + # Run and validate parsing for ArrayExpress data + run_and_validate_parsing(arrayexpress_parser, mock_xref_dbi, 5) + + # Check the row counts in the xref and gene_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DIRECT' AND source_id={SOURCE_ID_ARRAYEXPRESS}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 5) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "ENSG00000139618", "ENSG00000139618") + check_direct_xref_link(mock_xref_dbi, "gene", "ENSG00000157764", "ENSG00000157764") + check_direct_xref_link(mock_xref_dbi, "gene", "ENSG00000198786", "ENSG00000198786") + + # Run and validate re-parsing for ArrayExpress data + run_and_validate_parsing(arrayexpress_parser, mock_xref_dbi, 5, "Re-parsing: ") + + # Check the row counts in the xref and gene_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DIRECT' AND source_id={SOURCE_ID_ARRAYEXPRESS}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 5) \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_ccds_parser.py b/src/python/test/xrefs/parsers/test_ccds_parser.py new file mode 100644 index 000000000..1f7fe93e9 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_ccds_parser.py @@ -0,0 +1,91 @@ +import pytest +from unittest.mock import MagicMock, patch +from typing import Callable +from types import SimpleNamespace + +from ensembl.production.xrefs.parsers.CCDSParser import CCDSParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link + +# Constants +SOURCE_ID_CCDS = 1 +SPECIES_ID_HUMAN = 9606 + +# Fixture to create a CCDSParser instance +@pytest.fixture +def ccds_parser() -> CCDSParser: + return CCDSParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(ccds_parser: CCDSParser, mock_xref_dbi: DBConnection, expected_xrefs: int, expected_direct_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = ccds_parser.run( + { + "source_id": SOURCE_ID_CCDS, + "species_id": SPECIES_ID_HUMAN, + "dba": "mock_ccds_db_url", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing CCDS data" + assert ( + f"Parsed CCDS identifiers, added {expected_xrefs} xrefs and {expected_direct_xrefs} direct_xrefs" in result_message + ), f"{prefix}Expected 'Parsed CCDS identifiers, added {expected_xrefs} xrefs and {expected_direct_xrefs} direct_xrefs' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser 
arguments are passed: source_id and species_id +def test_ccds_no_source_id(ccds_parser: CCDSParser, test_no_source_id: Callable[[CCDSParser, int], None]) -> None: + test_no_source_id(ccds_parser, SPECIES_ID_HUMAN) + +def test_ccds_no_species_id(ccds_parser: CCDSParser, test_no_species_id: Callable[[CCDSParser, int], None]) -> None: + test_no_species_id(ccds_parser, SOURCE_ID_CCDS) + +# Test case to check if an error is returned when no CCDS database is provided +def test_no_ccds_db(ccds_parser: CCDSParser) -> None: + result_code, result_message = ccds_parser.run( + { + "source_id": SOURCE_ID_CCDS, + "species_id": SPECIES_ID_HUMAN, + "xref_dbi": MagicMock(), + } + ) + + assert result_code == 1, f"Errors when parsing CCDS data" + assert ( + "Could not find CCDS DB." in result_message + ), f"Expected 'Could not find CCDS DB.' in result_message, but got: '{result_message}'" + +# Test case to check successful parsing of valid CCDS data +def test_successful_parsing(mock_xref_dbi: DBConnection, ccds_parser: CCDSParser) -> None: + # Mock all needed methods + ccds_data = [ + {"stable_id": "CCDS2.2", "dbprimary_acc": "ENST00000342066"}, + {"stable_id": "CCDS3.1", "dbprimary_acc": "ENST00000327044"}, + {"stable_id": "CCDS4.1", "dbprimary_acc": "ENST00000379410"}, + {"stable_id": "CCDS5.1", "dbprimary_acc": "ENST00000379410"}, + {"stable_id": "CCDS7.2", "dbprimary_acc": "ENST00000421241"}, + {"stable_id": "CCDS7.2", "dbprimary_acc": "ENST00000379319"}, + ] + ccds_data_obj = [SimpleNamespace(**item) for item in ccds_data] + ccds_parser.get_ccds_data = MagicMock(return_value=ccds_data_obj) + + # Run and validate parsing for CCDS data + run_and_validate_parsing(ccds_parser, mock_xref_dbi, 5, 6) + + # Check the row counts in the xref and transcript_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DIRECT' AND source_id={SOURCE_ID_CCDS}") + check_row_count(mock_xref_dbi, "transcript_direct_xref", 6) + + # Check the link between an xref and transcript_direct_xref + check_direct_xref_link(mock_xref_dbi, "transcript", "ENST00000327044", "CCDS3.1") + check_direct_xref_link(mock_xref_dbi, "transcript", "ENST00000421241", "CCDS7.2") + check_direct_xref_link(mock_xref_dbi, "transcript", "ENST00000379319", "CCDS7.2") + + # Run and validate re-parsing for CCDS data + run_and_validate_parsing(ccds_parser, mock_xref_dbi, 5, 6, "Re-parsing: ") + + # Check the row counts in the xref and transcript_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DIRECT' AND source_id={SOURCE_ID_CCDS}") + check_row_count(mock_xref_dbi, "transcript_direct_xref", 6) \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_dbass_parser.py b/src/python/test/xrefs/parsers/test_dbass_parser.py new file mode 100644 index 000000000..c6ec23967 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_dbass_parser.py @@ -0,0 +1,147 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.DBASSParser import DBASSParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_synonym, check_direct_xref_link + +# Constants +SOURCE_ID_DBASS3 = 1 +SOURCE_ID_DBASS5 = 2 +SPECIES_ID_HUMAN = 9606 +EXPECTED_NUMBER_OF_COLUMNS = 23 + +# Fixture to create a DBASSParser instance +@pytest.fixture +def dbass_parser() -> DBASSParser: + return DBASSParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(dbass_parser: DBASSParser,
mock_xref_dbi: DBConnection, source_id: int, file: str, expected_direct_xrefs: int, expected_skipped_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = dbass_parser.run( + { + "source_id": source_id, + "species_id": SPECIES_ID_HUMAN, + "file": f"parsers/flatfiles/{file}.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing {file.upper()} data" + assert ( + f"{expected_direct_xrefs} direct xrefs successfully processed" in result_message + ), f"{prefix}Expected '{expected_direct_xrefs} direct xrefs successfully processed' in result_message, but got: '{result_message}'" + assert ( + f"Skipped {expected_skipped_xrefs} unmapped xrefs" in result_message + ), f"{prefix}Expected 'Skipped {expected_skipped_xrefs} unmapped xrefs' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_dbass_no_source_id(dbass_parser: DBASSParser, test_no_source_id: Callable[[DBASSParser, int], None]) -> None: + test_no_source_id(dbass_parser, SPECIES_ID_HUMAN) + +def test_dbass_no_species_id(dbass_parser: DBASSParser, test_no_species_id: Callable[[DBASSParser, int], None]) -> None: + test_no_species_id(dbass_parser, SOURCE_ID_DBASS3) + +def test_dbass_no_file(dbass_parser: DBASSParser, test_no_file: Callable[[DBASSParser, int, int], None]) -> None: + test_no_file(dbass_parser, SOURCE_ID_DBASS3, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_dbass_file_not_found(dbass_parser: DBASSParser, test_file_not_found: Callable[[DBASSParser, int, int], None]) -> None: + test_file_not_found(dbass_parser, SOURCE_ID_DBASS3, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is empty +def test_dbass_empty_file(dbass_parser: DBASSParser, test_empty_file: Callable[[DBASSParser, str, int, int], None]) -> None: + test_empty_file(dbass_parser, 'DBASS', SOURCE_ID_DBASS3, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the header has insufficient columns +def test_insufficient_header_columns(dbass_parser: DBASSParser) -> None: + mock_file = io.StringIO("Id,GeneSymbol,GeneFullName,EnsemblReference\n") + dbass_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in DBASS file"): + dbass_parser.run( + { + "source_id": SOURCE_ID_DBASS3, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Parametrized test case to check if an error is raised for various malformed headers +@pytest.mark.parametrize( + "header", [ + ("GeneId,GeneSymbol,GeneFullName,EnsemblReference,Phenotype,OmimReference,Mutation,Location,AuthenticAberrantDistance,ReadingFrameChange,NucleotideSequence,InTerminalExon,Comment,MutationCoordinates,AberrantSpliceSiteCoordinates,MaximumEntropyModelAuthentic,MaximumEntropyModelCryptic,FirstOrderMarkovModelAuthentic,FirstOrderMarkovModelCryptic,WeightMatrixModelAuthentic,WeightMatrixModelCryptic,PubMedReference,ReferenceText\n"), + 
("Id,GeneSymbols,GeneFullName,EnsemblReference,Phenotype,OmimReference,Mutation,Location,AuthenticAberrantDistance,ReadingFrameChange,NucleotideSequence,InTerminalExon,Comment,MutationCoordinates,AberrantSpliceSiteCoordinates,MaximumEntropyModelAuthentic,MaximumEntropyModelCryptic,FirstOrderMarkovModelAuthentic,FirstOrderMarkovModelCryptic,WeightMatrixModelAuthentic,WeightMatrixModelCryptic,PubMedReference,ReferenceText\n"), + ("Id,GeneSymbol,GeneFullName,EnsemblRef,Phenotype,OmimReference,Mutation,Location,AuthenticAberrantDistance,ReadingFrameChange,NucleotideSequence,InTerminalExon,Comment,MutationCoordinates,AberrantSpliceSiteCoordinates,MaximumEntropyModelAuthentic,MaximumEntropyModelCryptic,FirstOrderMarkovModelAuthentic,FirstOrderMarkovModelCryptic,WeightMatrixModelAuthentic,WeightMatrixModelCryptic,PubMedReference,ReferenceText\n"), + ], + ids=["first column", "second column", "fourth column"], +) +def test_malformed_headers(dbass_parser: DBASSParser, header: str) -> None: + mock_file = io.StringIO(header) + dbass_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in DBASS file"): + dbass_parser.run( + { + "source_id": SOURCE_ID_DBASS3, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the file has insufficient columns +def test_insufficient_columns(dbass_parser: DBASSParser) -> None: + mock_file = io.StringIO() + mock_file.write( + "Id,GeneSymbol,GeneFullName,EnsemblReference,Phenotype,OmimReference,Mutation,Location,AuthenticAberrantDistance,ReadingFrameChange,NucleotideSequence,InTerminalExon,Comment,MutationCoordinates,AberrantSpliceSiteCoordinates,MaximumEntropyModelAuthentic,MaximumEntropyModelCryptic,FirstOrderMarkovModelAuthentic,FirstOrderMarkovModelCryptic,WeightMatrixModelAuthentic,WeightMatrixModelCryptic,PubMedReference,ReferenceText\n" + ) + mock_file.write("1,GNAS complex locus,ENSG00000087460,Hereditary osteodystrophy,103580\n") + mock_file.seek(0) + + dbass_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="has an incorrect number of columns"): + dbass_parser.run( + { + "source_id": SOURCE_ID_DBASS3, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing +def test_successful_parsing(mock_xref_dbi: DBConnection, dbass_parser: DBASSParser) -> None: + # Run and validate parsing for DBASS3 and DBASS5 files + run_and_validate_parsing(dbass_parser, mock_xref_dbi, SOURCE_ID_DBASS3, "dbass3", 6, 1) + run_and_validate_parsing(dbass_parser, mock_xref_dbi, SOURCE_ID_DBASS5, "dbass5", 6, 0) + + # Check the row counts in the xref, gene_direct_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_DBASS3}") + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_DBASS5}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 12) + check_row_count(mock_xref_dbi, "synonym", 3) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "2", "ENSG00000130164") + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "2", SOURCE_ID_DBASS3, "LDLT") + check_synonym(mock_xref_dbi, "3", SOURCE_ID_DBASS3, "LDLT") + check_synonym(mock_xref_dbi, "4", SOURCE_ID_DBASS3, "LDLT") + + # Run and validate re-parsing for DBASS3 file 
+ run_and_validate_parsing(dbass_parser, mock_xref_dbi, SOURCE_ID_DBASS3, "dbass3", 6, 1, "Re-parsing: ") + + # Check the row counts in the xref, gene_direct_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_DBASS3}") + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_DBASS5}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 12) + check_row_count(mock_xref_dbi, "synonym", 3) diff --git a/src/python/test/xrefs/parsers/test_entrezgene_parser.py b/src/python/test/xrefs/parsers/test_entrezgene_parser.py new file mode 100644 index 000000000..f0d31f8ae --- /dev/null +++ b/src/python/test/xrefs/parsers/test_entrezgene_parser.py @@ -0,0 +1,157 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.EntrezGeneParser import EntrezGeneParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_synonym + +# Constants +SOURCE_ID_ENTREZGENE = 1 +SOURCE_ID_WIKIGENE = 2 +SPECIES_ID_HUMAN = 9606 +EXPECTED_NUMBER_OF_COLUMNS = 16 + +# Fixture to create an EntrezGeneParser instance +@pytest.fixture +def entrezgene_parser() -> EntrezGeneParser: + return EntrezGeneParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(entrezgene_parser: EntrezGeneParser, mock_xref_dbi: DBConnection, expected_entrez_xrefs: int, expected_wiki_xrefs: int, expected_synonyms: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = entrezgene_parser.run( + { + "source_id": SOURCE_ID_ENTREZGENE, + "species_id": SPECIES_ID_HUMAN, + "file": "parsers/flatfiles/entrezgene.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing EntrezGene data" + assert ( + f"{expected_entrez_xrefs} EntrezGene Xrefs and {expected_wiki_xrefs} WikiGene Xrefs added with {expected_synonyms} synonyms" in result_message + ), f"{prefix}Expected '{expected_entrez_xrefs} EntrezGene Xrefs and {expected_wiki_xrefs} WikiGene Xrefs added with {expected_synonyms} synonyms' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_entrezgene_no_source_id(entrezgene_parser: EntrezGeneParser, test_no_source_id: Callable[[EntrezGeneParser, int], None]) -> None: + test_no_source_id(entrezgene_parser, SPECIES_ID_HUMAN) + +def test_entrezgene_no_species_id(entrezgene_parser: EntrezGeneParser, test_no_species_id: Callable[[EntrezGeneParser, int], None]) -> None: + test_no_species_id(entrezgene_parser, SOURCE_ID_ENTREZGENE) + +def test_entrezgene_no_file(entrezgene_parser: EntrezGeneParser, test_no_file: Callable[[EntrezGeneParser, int, int], None]) -> None: + test_no_file(entrezgene_parser, SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_entrezgene_file_not_found(entrezgene_parser: EntrezGeneParser, test_file_not_found: Callable[[EntrezGeneParser, int, int], None]) -> None: + test_file_not_found(entrezgene_parser, SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is empty +def test_entrezgene_empty_file(entrezgene_parser: EntrezGeneParser, test_empty_file: Callable[[EntrezGeneParser, str, int, int], None]) -> None: + test_empty_file(entrezgene_parser, 'EntrezGene', SOURCE_ID_ENTREZGENE, 
SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the header has insufficient columns +def test_insufficient_header_columns(entrezgene_parser: EntrezGeneParser) -> None: + mock_file = io.StringIO("#tax_id\tgeneid\tsymbol\n") + entrezgene_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in EntrezGene file"): + entrezgene_parser.run( + { + "source_id": SOURCE_ID_ENTREZGENE, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Parametrized test case to check if an error is raised for various malformed headers +@pytest.mark.parametrize( + "header", [ + ("tax_ids\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneIDs\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbols\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocuTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSyn\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdb_Xrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchr\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmapp_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription:\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype__of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + 
("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomen_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tstatus\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tdesignations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tMod_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeaturetype\n"), + ], + ids=[ + "tax_id column", "gene_id column", "symbol column", "locus_tag column", "synonyms column", + "db_xrefs column", "chromosome column", "map_location column", "description column", + "type_of_gene column", "symbol_nomen_auth column", "full_name column", "nomen_status column", + "other_designations column", "mofification_date column", "feature_type column" + ], +) +def test_malformed_headers(entrezgene_parser: EntrezGeneParser, header: str) -> None: + mock_file = io.StringIO(header) + entrezgene_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in EntrezGene file"): + entrezgene_parser.run( + { + "source_id": SOURCE_ID_ENTREZGENE, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the file has insufficient columns +def test_insufficient_columns(entrezgene_parser: EntrezGeneParser) -> None: + mock_file = io.StringIO() + mock_file.write("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n") + mock_file.write("9606\t1\tA1BG\t-\tA1B|ABG|GAB|HYST2477\n") + mock_file.seek(0) + + entrezgene_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="has an incorrect number of columns"): + entrezgene_parser.run( + { + "source_id": SOURCE_ID_ENTREZGENE, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing of valid EntrezGene data +def test_successful_parsing(mock_xref_dbi: DBConnection, entrezgene_parser: EntrezGeneParser) -> None: + entrezgene_parser.get_source_id_for_source_name = MagicMock(return_value=SOURCE_ID_WIKIGENE) + + # Run and validate parsing for EntrezGene file + 
run_and_validate_parsing(entrezgene_parser, mock_xref_dbi, 10, 10, 26) + + # Check the row counts in the xref and synonym tables + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_WIKIGENE}") + check_row_count(mock_xref_dbi, "synonym", 26) + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "2", SOURCE_ID_ENTREZGENE, "A2MD") + check_synonym(mock_xref_dbi, "2", SOURCE_ID_ENTREZGENE, "CPAMD5") + check_synonym(mock_xref_dbi, "2", SOURCE_ID_ENTREZGENE, "FWP007") + check_synonym(mock_xref_dbi, "2", SOURCE_ID_ENTREZGENE, "S863-7") + + # Run and validate re-parsing for EntrezGene file + run_and_validate_parsing(entrezgene_parser, mock_xref_dbi, 10, 10, 26, "Re-parsing: ") + + # Check the row counts in the xref and synonym tables + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_WIKIGENE}") + check_row_count(mock_xref_dbi, "synonym", 26) diff --git a/src/python/test/xrefs/parsers/test_hgnc_parser.py b/src/python/test/xrefs/parsers/test_hgnc_parser.py new file mode 100644 index 000000000..7f920779f --- /dev/null +++ b/src/python/test/xrefs/parsers/test_hgnc_parser.py @@ -0,0 +1,182 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable, Dict + +from ensembl.production.xrefs.parsers.HGNCParser import HGNCParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link, check_dependent_xref_link, check_synonym + +# Constants +SOURCE_ID_HGNC = 1 +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" +SOURCE_ID_CCDS = 2 +SOURCE_ID_ENTREZGENE = 3 +SOURCE_ID_REFSEQ = 4 +SOURCE_ID_ENSEMBL_MANUAL = 5 +SOURCE_ID_LRG = 6 +SOURCE_ID_GENECARDS = 7 +SOURCE_ID_DESC_ONLY = 8 + +# Fixture to create an HGNCParser instance +@pytest.fixture +def hgnc_parser() -> HGNCParser: + return HGNCParser(True) + +# Mock for get_source_id_for_source_name +def mock_get_source_id_for_source_name(source_name: str, mock_xref_dbi: DBConnection, desc: str = None) -> int: + source_mapping = { + "ccds": SOURCE_ID_CCDS, + "entrezgene_manual": SOURCE_ID_ENTREZGENE, + "refseq_manual": SOURCE_ID_REFSEQ, + "ensembl_manual": SOURCE_ID_ENSEMBL_MANUAL, + "lrg_hgnc_notransfer": SOURCE_ID_LRG, + "genecards": SOURCE_ID_GENECARDS, + "desc_only": SOURCE_ID_DESC_ONLY, + } + + if source_name == "HGNC" and desc: + return source_mapping.get(desc, SOURCE_ID_HGNC) + + return source_mapping.get(source_name.lower(), SOURCE_ID_HGNC) + +# Function to run and validate the parsing process +def run_and_validate_parsing(hgnc_parser: HGNCParser, mock_xref_dbi: DBConnection, expected_xrefs: Dict[str, int], expected_mismatch: int, expected_synonyms: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = hgnc_parser.run( + { + "source_id": SOURCE_ID_HGNC, + "species_id": SPECIES_ID_HUMAN, + "file": "parsers/flatfiles/hgnc.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing HGNC data" + for count_type, count in expected_xrefs.items(): + assert f"{count_type}\t{count}" in result_message, f"{prefix}Expected '{count_type}\t{count}' in result_message, but got: '{result_message}'" + + assert ( + f"{expected_synonyms} synonyms added" in result_message + ), 
f"{prefix}Expected '{expected_synonyms} synonyms added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_mismatch} HGNC ids could not be associated in xrefs" in result_message + ), f"{prefix}Expected '{expected_mismatch} HGNC ids could not be associated in xrefs' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_hgnc_no_source_id(hgnc_parser: HGNCParser, test_no_source_id: Callable[[HGNCParser, int], None]) -> None: + test_no_source_id(hgnc_parser, SPECIES_ID_HUMAN) + +def test_hgnc_no_species_id(hgnc_parser: HGNCParser, test_no_species_id: Callable[[HGNCParser, int], None]) -> None: + test_no_species_id(hgnc_parser, SOURCE_ID_HGNC) + +def test_hgnc_no_file(hgnc_parser: HGNCParser, test_no_file: Callable[[HGNCParser, int, int], None]) -> None: + test_no_file(hgnc_parser, SOURCE_ID_HGNC, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when no CCDS database is provided +def test_no_ccds_db(hgnc_parser: HGNCParser) -> None: + with pytest.raises( + AttributeError, match="No ensembl ccds database provided" + ): + hgnc_parser.run( + { + "source_id": SOURCE_ID_HGNC, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the file is not found +def test_hgnc_file_not_found(hgnc_parser: HGNCParser, test_file_not_found: Callable[[HGNCParser, int, int], None]) -> None: + hgnc_parser.construct_db_url = MagicMock(return_value="dummy_db_url") + test_file_not_found(hgnc_parser, SOURCE_ID_HGNC, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is empty +def test_hgnc_empty_file(hgnc_parser: HGNCParser, test_empty_file: Callable[[HGNCParser, str, int, int], None]) -> None: + hgnc_parser.construct_db_url = MagicMock(return_value="dummy_db_url") + test_empty_file(hgnc_parser, 'HGNC', SOURCE_ID_HGNC, SPECIES_ID_HUMAN) + +# Test case to check successful parsing of valid HGNC data without existing ccds, refseq, or entrezgene xrefs +def test_successful_parsing_without_existing_xrefs(mock_xref_dbi: DBConnection, hgnc_parser: HGNCParser) -> None: + # Mock all needed methods + hgnc_parser.get_source_name_for_source_id = MagicMock(return_value="HGNC") + hgnc_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + hgnc_parser.construct_db_url = MagicMock(return_value="dummy_db_url") + hgnc_parser.get_ccds_to_ens_mapping = MagicMock(return_value={}) + hgnc_parser.get_valid_codes = MagicMock(return_value={}) + hgnc_parser.get_valid_xrefs_for_dependencies = MagicMock(return_value={}) + + # Run and validate parsing for HGNC file + expected_counts = {"ccds": 0, "entrezgene_manual": 0, "refseq_manual": 0, "ensembl_manual": 19, "lrg": 2, "genecards": 19} + run_and_validate_parsing(hgnc_parser, mock_xref_dbi, expected_counts, 1, 78) + + # Check the row counts in the xref, gene_direct_xref, dependent_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 19, f"info_type='DIRECT' AND source_id={SOURCE_ID_ENSEMBL_MANUAL}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DIRECT' AND source_id={SOURCE_ID_LRG}") + check_row_count(mock_xref_dbi, "xref", 19, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_GENECARDS}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='MISC' AND source_id={SOURCE_ID_DESC_ONLY}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 21) + check_row_count(mock_xref_dbi, 
"dependent_xref", 19) + check_row_count(mock_xref_dbi, "synonym", 78) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "HGNC:5", "ENSG00000121410") + +# Test case to check successful parsing of valid HGNC data with existing ccds, refseq, and entrezgene xrefs +def test_successful_parsing_with_existing_xrefs(mock_xref_dbi: DBConnection, hgnc_parser: HGNCParser) -> None: + # Mock all needed methods + hgnc_parser.get_source_name_for_source_id = MagicMock(return_value="HGNC") + hgnc_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + hgnc_parser.construct_db_url = MagicMock(return_value="dummy_db_url") + hgnc_parser.get_ccds_to_ens_mapping = MagicMock(return_value={"CCDS12976": "CCDS12976", "CCDS8856": "CCDS8856", "CCDS53797": "CCDS53797"}) + hgnc_parser.get_valid_codes = MagicMock(return_value={"NM_130786": [12], "NR_026971": [34, 56], "NR_015380": [78], "NM_001088": [90]}) + hgnc_parser.get_valid_xrefs_for_dependencies = MagicMock(return_value={"503538": 123, "441376": 456, "51146": 789}) + + # Run and validate parsing for HGNC file + expected_counts = {"ccds": 3, "entrezgene_manual": 3, "refseq_manual": 5, "ensembl_manual": 19, "lrg": 2, "genecards": 19} + run_and_validate_parsing(hgnc_parser, mock_xref_dbi, expected_counts, 1, 90) + + # Check the row counts in the xref, gene_direct_xref, dependent_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DIRECT' AND source_id={SOURCE_ID_CCDS}") + check_row_count(mock_xref_dbi, "xref", 19, f"info_type='DIRECT' AND source_id={SOURCE_ID_ENSEMBL_MANUAL}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DIRECT' AND source_id={SOURCE_ID_LRG}") + check_row_count(mock_xref_dbi, "xref", 19, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_GENECARDS}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_REFSEQ}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='MISC' AND source_id={SOURCE_ID_DESC_ONLY}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 24) + check_row_count(mock_xref_dbi, "dependent_xref", 27) + check_row_count(mock_xref_dbi, "synonym", 90) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "HGNC:13666", "CCDS8856") + check_direct_xref_link(mock_xref_dbi, "gene", "HGNC:20", "LRG_359") + + # Check the link between an xref and dependent_xref + check_dependent_xref_link(mock_xref_dbi, "HGNC:5", 12) + check_dependent_xref_link(mock_xref_dbi, "HGNC:27057", 56) + check_dependent_xref_link(mock_xref_dbi, "HGNC:17968", 789) + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "HGNC:8", SOURCE_ID_ENSEMBL_MANUAL, "A2MP") + check_synonym(mock_xref_dbi, "HGNC:37133", SOURCE_ID_ENTREZGENE, "FLJ23569") + check_synonym(mock_xref_dbi, "HGNC:37133", SOURCE_ID_REFSEQ, "FLJ23569") + + # Run and validate re-parsing for HGNC file + run_and_validate_parsing(hgnc_parser, mock_xref_dbi, expected_counts, 1, 90, "Re-parsing: ") + + # Check the row counts in the xref, gene_direct_xref, dependent_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DIRECT' AND source_id={SOURCE_ID_CCDS}") + check_row_count(mock_xref_dbi, "xref", 19, f"info_type='DIRECT' AND source_id={SOURCE_ID_ENSEMBL_MANUAL}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DIRECT' 
AND source_id={SOURCE_ID_LRG}") + check_row_count(mock_xref_dbi, "xref", 19, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_GENECARDS}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_REFSEQ}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='MISC' AND source_id={SOURCE_ID_DESC_ONLY}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 24) + check_row_count(mock_xref_dbi, "dependent_xref", 27) + check_row_count(mock_xref_dbi, "synonym", 90) + diff --git a/src/python/test/xrefs/parsers/test_hpa_parser.py b/src/python/test/xrefs/parsers/test_hpa_parser.py new file mode 100644 index 000000000..838a3756e --- /dev/null +++ b/src/python/test/xrefs/parsers/test_hpa_parser.py @@ -0,0 +1,132 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.HPAParser import HPAParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link + +# Constants +SOURCE_ID_HPA = 1 +SPECIES_ID_HUMAN = 9606 +EXPECTED_NUMBER_OF_COLUMNS = 4 + +# Fixture to create an HPAParser instance +@pytest.fixture +def hpa_parser() -> HPAParser: + return HPAParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(hpa_parser: HPAParser, mock_xref_dbi: DBConnection, expected_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = hpa_parser.run( + { + "source_id": SOURCE_ID_HPA, + "species_id": SPECIES_ID_HUMAN, + "file": "parsers/flatfiles/hpa.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing HPA data" + assert ( + f"{expected_xrefs} direct xrefs successfully parsed" in result_message + ), f"{prefix}Expected '{expected_xrefs} direct xrefs successfully parsed' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_hpa_no_source_id(hpa_parser: HPAParser, test_no_source_id: Callable[[HPAParser, int], None]) -> None: + test_no_source_id(hpa_parser, SPECIES_ID_HUMAN) + +def test_hpa_no_species_id(hpa_parser: HPAParser, test_no_species_id: Callable[[HPAParser, int], None]) -> None: + test_no_species_id(hpa_parser, SOURCE_ID_HPA) + +def test_hpa_no_file(hpa_parser: HPAParser, test_no_file: Callable[[HPAParser, int, int], None]) -> None: + test_no_file(hpa_parser, SOURCE_ID_HPA, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_hpa_file_not_found(hpa_parser: HPAParser, test_file_not_found: Callable[[HPAParser, int, int], None]) -> None: + test_file_not_found(hpa_parser, SOURCE_ID_HPA, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is empty +def test_hpa_empty_file(hpa_parser: HPAParser, test_empty_file: Callable[[HPAParser, str, int, int], None]) -> None: + test_empty_file(hpa_parser, 'HPA', SOURCE_ID_HPA, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the header has insufficient columns +def test_insufficient_header_columns(hpa_parser: HPAParser) -> None: + mock_file = io.StringIO("antibody,antibody_id\n") + hpa_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in HPA file"): + hpa_parser.run( + { + "source_id": 
SOURCE_ID_HPA, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Parametrized test case to check if an error is raised for various malformed headers +@pytest.mark.parametrize( + "header", [ + ("Antibodies,antibody_id,ensembl_peptide_id,link\n"), + ("Antibody,antibodyId,ensembl_peptide_id,link\n"), + ("Antibody,antibody_id,ensembl peptide id,link\n"), + ("Antibody,antibody_id,ensembl_peptide_id,links\n") + ], + ids=["antibody column", "antibody_id column", "ensembl_id column", "link column"], +) +def test_malformed_headers(hpa_parser: HPAParser, header: str) -> None: + mock_file = io.StringIO(header) + hpa_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in HPA file"): + hpa_parser.run( + { + "source_id": SOURCE_ID_HPA, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the file has insufficient columns +def test_insufficient_columns(hpa_parser: HPAParser) -> None: + mock_file = io.StringIO() + mock_file.write("Antibody,antibody_id,ensembl_peptide_id,link\n") + mock_file.write("CAB000001,1,ENSP00000363822\n") + mock_file.seek(0) + + hpa_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="has an incorrect number of columns"): + hpa_parser.run( + { + "source_id": SOURCE_ID_HPA, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing of valid HPA data +def test_successful_parsing(mock_xref_dbi: DBConnection, hpa_parser: HPAParser) -> None: + # Run and validate parsing for HPA file + run_and_validate_parsing(hpa_parser, mock_xref_dbi, 10) + + # Check the row counts in the xref and direct_xref tables + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DIRECT' AND source_id={SOURCE_ID_HPA}") + check_row_count(mock_xref_dbi, "translation_direct_xref", 10) + + # Check the link between an xref and translation_direct_xref + check_direct_xref_link(mock_xref_dbi, "translation", "2", "ENSP00000224784") + + # Run and validate re-parsing of the HPA file + run_and_validate_parsing(hpa_parser, mock_xref_dbi, 10, "Re-parsing: ") + + # Re-check the row counts in the xref and direct_xref tables after re-parsing + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DIRECT' AND source_id={SOURCE_ID_HPA}") + check_row_count(mock_xref_dbi, "translation_direct_xref", 10) \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_jgi_protein_parser.py b/src/python/test/xrefs/parsers/test_jgi_protein_parser.py new file mode 100644 index 000000000..666e6fa95 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_jgi_protein_parser.py @@ -0,0 +1,61 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.JGI_ProteinParser import JGI_ProteinParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count + +# Constants +SOURCE_ID_JGI = 1 +SPECIES_ID_C_INTESTINALIS = 7719 + +# Fixture to create a JGI_ProteinParser instance +@pytest.fixture +def jgi_protein_parser() -> JGI_ProteinParser: + return JGI_ProteinParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(jgi_protein_parser: JGI_ProteinParser, mock_xref_dbi: DBConnection, expected_xrefs: int, prefix: str = None) -> None: + if prefix 
is None: + prefix = "" + + result_code, result_message = jgi_protein_parser.run( + { + "source_id": SOURCE_ID_JGI, + "species_id": SPECIES_ID_C_INTESTINALIS, + "file": "parsers/flatfiles/jgi_protein.fasta", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing JGI data" + assert f"{expected_xrefs} JGI_ xrefs successfully parsed" in result_message, f"{prefix}Expected '{expected_xrefs} JGI_ xrefs successfully parsed' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_jgi_no_source_id(jgi_protein_parser: JGI_ProteinParser, test_no_source_id: Callable[[JGI_ProteinParser, int], None]) -> None: + test_no_source_id(jgi_protein_parser, SPECIES_ID_C_INTESTINALIS) + +def test_jgi_no_species_id(jgi_protein_parser: JGI_ProteinParser, test_no_species_id: Callable[[JGI_ProteinParser, int], None]) -> None: + test_no_species_id(jgi_protein_parser, SOURCE_ID_JGI) + +def test_jgi_no_file(jgi_protein_parser: JGI_ProteinParser, test_no_file: Callable[[JGI_ProteinParser, int, int], None]) -> None: + test_no_file(jgi_protein_parser, SOURCE_ID_JGI, SPECIES_ID_C_INTESTINALIS) + +# Test case to check if an error is raised when the file is not found +def test_jgi_file_not_found(jgi_protein_parser: JGI_ProteinParser, test_file_not_found: Callable[[JGI_ProteinParser, int, int], None]) -> None: + test_file_not_found(jgi_protein_parser, SOURCE_ID_JGI, SPECIES_ID_C_INTESTINALIS) + +# Test case to check if an error is raised when the file is empty +def test_jgi_empty_file(jgi_protein_parser: JGI_ProteinParser, test_empty_file: Callable[[JGI_ProteinParser, str, int, int], None]) -> None: + test_empty_file(jgi_protein_parser, 'JGIProtein', SOURCE_ID_JGI, SPECIES_ID_C_INTESTINALIS) + +# Test case to check successful parsing +def test_successful_parsing(mock_xref_dbi: DBConnection, jgi_protein_parser: JGI_ProteinParser) -> None: + # Run and validate parsing for JGI Protein file + run_and_validate_parsing(jgi_protein_parser, mock_xref_dbi, 9) + + # Check the row counts in the xref and primary_xref tables + check_row_count(mock_xref_dbi, "xref", 9, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_JGI}") + check_row_count(mock_xref_dbi, "primary_xref", 9) \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_mgi_desc_parser.py b/src/python/test/xrefs/parsers/test_mgi_desc_parser.py new file mode 100644 index 000000000..02b46352b --- /dev/null +++ b/src/python/test/xrefs/parsers/test_mgi_desc_parser.py @@ -0,0 +1,148 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.MGIDescParser import MGIDescParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_synonym + +# Constants +SOURCE_ID_MGI_DESC = 1 +SPECIES_ID_MOUSE = 10090 +EXPECTED_NUMBER_OF_COLUMNS = 12 + +# Fixture to create an MGIDescParser instance +@pytest.fixture +def mgi_desc_parser() -> MGIDescParser: + return MGIDescParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(mgi_desc_parser: MGIDescParser, mock_xref_dbi: DBConnection, expected_xrefs: int, expected_synonyms: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = mgi_desc_parser.run( + { + "source_id": SOURCE_ID_MGI_DESC, + "species_id": SPECIES_ID_MOUSE, + "file": "parsers/flatfiles/mgi_desc.txt", + "xref_dbi": 
mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing MGI Description data" + assert ( + f"{expected_xrefs} MGI Description Xrefs added" in result_message + ), f"{prefix}Expected '{expected_xrefs} MGI Description Xrefs added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_synonyms} synonyms added" in result_message + ), f"{prefix}Expected '{expected_synonyms} synonyms added' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_mgi_desc_no_source_id(mgi_desc_parser: MGIDescParser, test_no_source_id: Callable[[MGIDescParser, int], None]) -> None: + test_no_source_id(mgi_desc_parser, SPECIES_ID_MOUSE) + +def test_mgi_desc_no_species_id(mgi_desc_parser: MGIDescParser, test_no_species_id: Callable[[MGIDescParser, int], None]) -> None: + test_no_species_id(mgi_desc_parser, SOURCE_ID_MGI_DESC) + +def test_mgi_desc_no_file(mgi_desc_parser: MGIDescParser, test_no_file: Callable[[MGIDescParser, int, int], None]) -> None: + test_no_file(mgi_desc_parser, SOURCE_ID_MGI_DESC, SPECIES_ID_MOUSE) + +# Test case to check if an error is raised when the file is not found +def test_mgi_desc_file_not_found(mgi_desc_parser: MGIDescParser, test_file_not_found: Callable[[MGIDescParser, int, int], None]) -> None: + test_file_not_found(mgi_desc_parser, SOURCE_ID_MGI_DESC, SPECIES_ID_MOUSE) + +# Test case to check if an error is raised when the file is empty +def test_mgi_desc_empty_file(mgi_desc_parser: MGIDescParser, test_empty_file: Callable[[MGIDescParser, str, int, int], None]) -> None: + test_empty_file(mgi_desc_parser, 'MGI_desc', SOURCE_ID_MGI_DESC, SPECIES_ID_MOUSE) + +# Test case to check if an error is raised when the header has insufficient columns +def test_insufficient_header_columns(mgi_desc_parser: MGIDescParser) -> None: + mock_file = io.StringIO("mgi accession id\tchr\tcm position\n") + mgi_desc_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in MGI_desc file"): + mgi_desc_parser.run( + { + "source_id": SOURCE_ID_MGI_DESC, + "species_id": SPECIES_ID_MOUSE, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Parametrized test case to check if an error is raised for various malformed headers +@pytest.mark.parametrize( + "header", [ + ("MGI_accession_ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChromosome\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Pos\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coord start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coord end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tchr strand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature 
Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tSymbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tMarker Status\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tName\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker_Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Types\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms\n"), + ], + ids=[ + "accession column", "chromosome column", "position column", "coord start column", + "coord end column", "strand column", "symbol column", "status column", "name column", + "marker type column", "feature type column", "synonyms column" + ], +) +def test_malformed_headers(mgi_desc_parser: MGIDescParser, header: str) -> None: + mock_file = io.StringIO(header) + mgi_desc_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in MGI_desc file"): + mgi_desc_parser.run( + { + "source_id": SOURCE_ID_MGI_DESC, + "species_id": SPECIES_ID_MOUSE, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the file has insufficient columns +def test_insufficient_columns(mgi_desc_parser: MGIDescParser) -> None: + mock_file = io.StringIO() + mock_file.write("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n") + mock_file.write("MGI:1341858\t5\tsyntenic\n") + mock_file.seek(0) + + mgi_desc_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="has an incorrect number of columns"): + mgi_desc_parser.run( + { + "source_id": SOURCE_ID_MGI_DESC, + "species_id": SPECIES_ID_MOUSE, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing of valid MGI Description data +def test_successful_parsing(mock_xref_dbi: DBConnection, mgi_desc_parser: MGIDescParser) -> None: + # Run and validate parsing for MGI Description file + run_and_validate_parsing(mgi_desc_parser, mock_xref_dbi, 10, 2) + + # Check the row counts in the xref and synonym tables + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='MISC' AND source_id={SOURCE_ID_MGI_DESC}") + check_row_count(mock_xref_dbi, "synonym", 2) + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "MGI:1926146", SOURCE_ID_MGI_DESC, "Ecrg4") + + # Run and validate re-parsing for MGI Description file + run_and_validate_parsing(mgi_desc_parser, mock_xref_dbi, 10, 2, "Re-parsing: ") + + # Check the row counts in the xref and synonym tables again + check_row_count(mock_xref_dbi, "xref", 10, 
f"info_type='MISC' AND source_id={SOURCE_ID_MGI_DESC}") + check_row_count(mock_xref_dbi, "synonym", 2) + diff --git a/src/python/test/xrefs/parsers/test_mgi_parser.py b/src/python/test/xrefs/parsers/test_mgi_parser.py new file mode 100644 index 000000000..fab933d60 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_mgi_parser.py @@ -0,0 +1,84 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.MGIParser import MGIParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_synonym, check_direct_xref_link + +# Constants +SOURCE_ID_MGI = 1 +SPECIES_ID_MOUSE = 10090 + +# Fixture to create an MGIParser instance +@pytest.fixture +def mgi_parser() -> MGIParser: + return MGIParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(mgi_parser: MGIParser, mock_xref_dbi: DBConnection, expected_direct_xrefs: int, expected_synonyms: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = mgi_parser.run( + { + "source_id": SOURCE_ID_MGI, + "species_id": SPECIES_ID_MOUSE, + "file": "parsers/flatfiles/mgi.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing MGI data" + assert ( + f"{expected_direct_xrefs} direct MGI xrefs added" in result_message + ), f"{prefix}Expected '{expected_direct_xrefs} direct MGI xrefs added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_synonyms} synonyms added" in result_message + ), f"{prefix}Expected '{expected_synonyms} synonyms added' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_mgi_no_source_id(mgi_parser: MGIParser, test_no_source_id: Callable[[MGIParser, int], None]) -> None: + test_no_source_id(mgi_parser, SPECIES_ID_MOUSE) + +def test_mgi_no_species_id(mgi_parser: MGIParser, test_no_species_id: Callable[[MGIParser, int], None]) -> None: + test_no_species_id(mgi_parser, SOURCE_ID_MGI) + +def test_mgi_no_file(mgi_parser: MGIParser, test_no_file: Callable[[MGIParser, int, int], None]) -> None: + test_no_file(mgi_parser, SOURCE_ID_MGI, SPECIES_ID_MOUSE) + +# Test case to check if an error is raised when the file is not found +def test_mgi_file_not_found(mgi_parser: MGIParser, test_file_not_found: Callable[[MGIParser, int, int], None]) -> None: + test_file_not_found(mgi_parser, SOURCE_ID_MGI, SPECIES_ID_MOUSE) + +# Test case to check if an error is raised when the file is empty +def test_mgi_empty_file(mgi_parser: MGIParser, test_empty_file: Callable[[MGIParser, str, int, int], None]) -> None: + test_empty_file(mgi_parser, 'MGI', SOURCE_ID_MGI, SPECIES_ID_MOUSE) + +# Test case to check successful parsing of valid MGI data +def test_successful_parsing(mock_xref_dbi: DBConnection, mgi_parser: MGIParser) -> None: + # Mock the synonym hash to return some test synonyms + mgi_parser.get_ext_synonyms = MagicMock(return_value={"MGI:1926146": ["Ecrg4", "augurin"]}) + + # Run and validate parsing for MGI file + run_and_validate_parsing(mgi_parser, mock_xref_dbi, 10, 2) + + # Check the row counts in the xref and synonym tables + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DIRECT' AND source_id={SOURCE_ID_MGI}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 10) + check_row_count(mock_xref_dbi, "synonym", 2) + + # Check the link between an xref and gene_direct_xref + 
check_direct_xref_link(mock_xref_dbi, "gene", "MGI:1914753", "ENSMUSG00000103746") + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "MGI:1926146", SOURCE_ID_MGI, "Ecrg4") + check_synonym(mock_xref_dbi, "MGI:1926146", SOURCE_ID_MGI, "augurin") + + # Run and validate re-parsing for MGI file + run_and_validate_parsing(mgi_parser, mock_xref_dbi, 10, 2, "Re-parsing: ") + + # Check the row counts in the xref and synonym tables again + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DIRECT' AND source_id={SOURCE_ID_MGI}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 10) + check_row_count(mock_xref_dbi, "synonym", 2) diff --git a/src/python/test/xrefs/parsers/test_mim2gene_parser.py b/src/python/test/xrefs/parsers/test_mim2gene_parser.py new file mode 100644 index 000000000..590c1c3bc --- /dev/null +++ b/src/python/test/xrefs/parsers/test_mim2gene_parser.py @@ -0,0 +1,250 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable +from sqlalchemy import text + +from ensembl.production.xrefs.parsers.Mim2GeneParser import Mim2GeneParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_dependent_xref_link + +# Constants +SOURCE_ID_MIM2GENE = 1 +SOURCE_ID_MIM_GENE = 2 +SOURCE_ID_MIM_MORBID = 3 +SOURCE_ID_ENTREZGENE = 4 +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" + +# Fixture to create a Mim2GeneParser instance +@pytest.fixture +def mim2gene_parser() -> Mim2GeneParser: + return Mim2GeneParser(True) + +# Mock for get_source_id_for_source_name +def mock_get_source_id_for_source_name(source_name: str, mock_xref_dbi: DBConnection) -> int: + source_mapping = { + "MIM_GENE": SOURCE_ID_MIM_GENE, + "MIM_MORBID": SOURCE_ID_MIM_MORBID, + "EntrezGene": SOURCE_ID_ENTREZGENE, + } + return source_mapping.get(source_name, SOURCE_ID_MIM2GENE) + +# Function to populate the database with MIM and EntrezGene xrefs +def populate_xref_db(mock_xref_dbi: DBConnection): + source_data = [ + [SOURCE_ID_MIM2GENE, 'MIM2GENE', 10], + [SOURCE_ID_MIM_GENE, 'MIM_GENE', 10], + [SOURCE_ID_MIM_MORBID, 'MIM_MORBID', 10], + [SOURCE_ID_ENTREZGENE, 'EntrezGene', 10], + ] + for row in source_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO source (source_id, name, ordered) + VALUES (:source_id, :name, :ordered) + """ + ), + { + "source_id": row[0], + "name": row[1], + "ordered": row[2], + } + ) + + xref_data = [ + [1, '100050', SOURCE_ID_MIM_MORBID, SPECIES_ID_HUMAN, 'UNMAPPED'], # unmapped + [2, '100640', SOURCE_ID_MIM_GENE, SPECIES_ID_HUMAN, 'UNMAPPED'], # dependent + [3, '100100', SOURCE_ID_MIM_MORBID, SPECIES_ID_HUMAN, 'UNMAPPED'], # dependent + [4, '142830', SOURCE_ID_MIM_MORBID, SPECIES_ID_HUMAN, 'UNMAPPED'], # unmapped + [5, '142830', SOURCE_ID_MIM_GENE, SPECIES_ID_HUMAN, 'UNMAPPED'], # unmapped + [6, '100660', SOURCE_ID_MIM_GENE, SPECIES_ID_HUMAN, 'UNMAPPED'], # dependent + [7, '100300', SOURCE_ID_MIM_MORBID, SPECIES_ID_HUMAN, 'UNMAPPED'], # via synonym + [8, '999999', SOURCE_ID_MIM_GENE, SPECIES_ID_HUMAN, 'UNMAPPED'], # not referenced + [9, '216', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DIRECT'], # <- 100640 + [10, '1131', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DIRECT'], # <- 100100 + [11, '218', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DIRECT'], # 100660 + [12, '222222', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DIRECT'], # not referenced <- via synonym + ] + for row in xref_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO xref (xref_id, accession, source_id, 
species_id, info_type) + VALUES (:xref_id, :accession, :source_id, :species_id, :info_type) + """ + ), + { + "xref_id": row[0], + "accession": row[1], + "source_id": row[2], + "species_id": row[3], + "info_type": row[4], + } + ) + + mock_xref_dbi.commit() + +# Function to run and validate the parsing process +def run_and_validate_parsing(mim2gene_parser: Mim2GeneParser, mock_xref_dbi: DBConnection, expected_entries: int, expected_missed_omim: int, expected_entrez: int, expected_missed_master: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = mim2gene_parser.run( + { + "source_id": SOURCE_ID_MIM2GENE, + "species_id": SPECIES_ID_HUMAN, + "file": "parsers/flatfiles/mim2gene.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing Mim2Gene data" + assert ( + f"Processed {expected_entries} entries" in result_message + ), f"{prefix}Expected 'Processed {expected_entries} entries' in result message, but got: '{result_message}'" + assert ( + f"{expected_missed_omim} had missing OMIM entries" in result_message + ), f"{prefix}Expected '{expected_missed_omim} had missing OMIM entries' in result message, but got: '{result_message}'" + assert ( + f"{expected_entrez} were dependent EntrezGene xrefs" in result_message + ), f"{prefix}Expected '{expected_entrez} were dependent EntrezGene xrefs' in result message, but got: '{result_message}'" + assert ( + f"{expected_missed_master} had missing master entries" in result_message + ), f"{prefix}Expected '{expected_missed_master} had missing master entries' in result message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_mim2gene_no_source_id(mim2gene_parser: Mim2GeneParser, test_no_source_id: Callable[[Mim2GeneParser, int], None]) -> None: + test_no_source_id(mim2gene_parser, SPECIES_ID_HUMAN) + +def test_mim2gene_no_species_id(mim2gene_parser: Mim2GeneParser, test_no_species_id: Callable[[Mim2GeneParser, int], None]) -> None: + test_no_species_id(mim2gene_parser, SOURCE_ID_MIM2GENE) + +def test_mim2gene_no_file(mim2gene_parser: Mim2GeneParser, test_no_file: Callable[[Mim2GeneParser, int, int], None]) -> None: + test_no_file(mim2gene_parser, SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_mim2gene_file_not_found(mim2gene_parser: Mim2GeneParser, test_file_not_found: Callable[[Mim2GeneParser, int, int], None]) -> None: + test_file_not_found(mim2gene_parser, SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is empty +def test_mim2gene_empty_file(mim2gene_parser: Mim2GeneParser, test_empty_file: Callable[[Mim2GeneParser, str, int, int], None]) -> None: + test_empty_file(mim2gene_parser, 'Mim2Gene', SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the required source_id is missing +def test_mim2gene_missing_required_source_id(mim2gene_parser: Mim2GeneParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[Mim2GeneParser, DBConnection, str, int, int, str], None]) -> None: + test_missing_required_source_id(mim2gene_parser, mock_xref_dbi, 'MIM_GENE', SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the header has insufficient columns +def test_insufficient_header_columns(mim2gene_parser: Mim2GeneParser) -> None: + mim2gene_parser.get_source_id_for_source_name = 
MagicMock(side_effect=mock_get_source_id_for_source_name) + + mock_file = io.StringIO("#MIM number\tGeneID\ttype\tSource\tMedGenCUI\n") + mim2gene_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in Mim2Gene file"): + mim2gene_parser.run( + { + "source_id": SOURCE_ID_MIM2GENE, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Parametrized test case to check if an error is raised for various malformed headers +@pytest.mark.parametrize( + "header", [ + ("#MIM\tGeneID\ttype\tSource\tMedGenCUI\tComment\n"), + ("#MIM number\tGene_ID\ttype\tSource\tMedGenCUI\tComment\n"), + ("#MIM number\tGeneID\tTYPE\tSource\tMedGenCUI\tComment\n"), + ("#MIM number\tGeneID\ttype\tsource\tMedGenCUI\tComment\n"), + ("#MIM number\tGeneID\ttype\tSource\tMedGen\tComment\n"), + ("#MIM number\tGeneID\ttype\tSource\tMedGenCUI\tComments\n"), + ], + ids=["mim_number column", "gene_id column", "type column", "source column", "medgen_cui column", "comment column"], +) +def test_malformed_headers(mim2gene_parser: Mim2GeneParser, header: str) -> None: + mim2gene_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + mock_file = io.StringIO(header) + mim2gene_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in Mim2Gene file"): + mim2gene_parser.run( + { + "source_id": SOURCE_ID_MIM2GENE, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the file has insufficient columns +def test_insufficient_columns(mim2gene_parser: Mim2GeneParser) -> None: + mim2gene_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + mock_file = io.StringIO() + mock_file.write("#MIM number\tGeneID\ttype\tSource\tMedGenCUI\tComment\n") + mock_file.write("100050\t-\tphenotype\t-\n") + mock_file.seek(0) + + mim2gene_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="has an incorrect number of columns"): + mim2gene_parser.run( + { + "source_id": SOURCE_ID_MIM2GENE, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing of valid Mim2Gene data without existing mim or entrezgene xrefs +def test_successful_parsing_without_existing_xrefs(mock_xref_dbi: DBConnection, mim2gene_parser: Mim2GeneParser) -> None: + mim2gene_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + # Run and validate parsing for Mim2Gene file + run_and_validate_parsing(mim2gene_parser, mock_xref_dbi, 9, 9, 0, 0) + + # Check that no xrefs were added + check_row_count(mock_xref_dbi, "xref", 0) + +# Test case to check successful parsing of valid Mim2Gene data with existing mim and entrezgene xrefs +def test_successful_parsing_with_existing_xrefs(mock_xref_dbi: DBConnection, mim2gene_parser: Mim2GeneParser) -> None: + mim2gene_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + populate_xref_db(mock_xref_dbi) + + # Check the row counts in the xref and dependent_xref tables before running the parser + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_GENE}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='UNMAPPED' AND 
source_id={SOURCE_ID_MIM_MORBID}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DIRECT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "dependent_xref", 0) + + # Run and validate parsing for Mim2Gene file + run_and_validate_parsing(mim2gene_parser, mock_xref_dbi, 9, 4, 3, 2) + + # Check the row counts in the xref and dependent_xref tables + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_GENE}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_MIM_GENE}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_MORBID}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_MIM_MORBID}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DIRECT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "dependent_xref", 3) + + # Check the link between an xref and dependent_xref + check_dependent_xref_link(mock_xref_dbi, "100640", 9) + check_dependent_xref_link(mock_xref_dbi, "100100", 10) + + # Run and validate re-parsing for Mim2Gene file + run_and_validate_parsing(mim2gene_parser, mock_xref_dbi, 9, 4, 3, 2, "Re-parsing: ") + + # Check the row counts in the xref and dependent_xref tables + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_GENE}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_MIM_GENE}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_MORBID}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_MIM_MORBID}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DIRECT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "dependent_xref", 3) diff --git a/src/python/test/xrefs/parsers/test_mim_parser.py b/src/python/test/xrefs/parsers/test_mim_parser.py new file mode 100644 index 000000000..676c182bf --- /dev/null +++ b/src/python/test/xrefs/parsers/test_mim_parser.py @@ -0,0 +1,126 @@ +import pytest +from unittest.mock import MagicMock, patch +from typing import Callable + +from ensembl.production.xrefs.parsers.MIMParser import MIMParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_synonym + +# Constants +SOURCE_ID_MIM = 1 +SOURCE_ID_MIM_GENE = 2 +SOURCE_ID_MIM_MORBID = 3 +SPECIES_ID_HUMAN = 9606 + +# Fixture to create a MIMParser instance +@pytest.fixture +def mim_parser() -> MIMParser: + return MIMParser(True) + +# Mock for get_source_id_for_source_name +def mock_get_source_id_for_source_name(source_name: str, mock_xref_dbi: DBConnection) -> int: + if source_name == "MIM_GENE": + return SOURCE_ID_MIM_GENE + elif source_name == "MIM_MORBID": + return SOURCE_ID_MIM_MORBID + else: + return SOURCE_ID_MIM + +# Function to run and validate the parsing process +def run_and_validate_parsing(mim_parser: MIMParser, mock_xref_dbi: DBConnection, expected_genemap_xrefs: int, expected_phenotype_xrefs: int, expected_synonyms: int, expected_removed_entries: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = mim_parser.run( + { + "source_id": SOURCE_ID_MIM, + "species_id": SPECIES_ID_HUMAN, + "file": "parsers/flatfiles/mim.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing MIM data" + assert ( + 
f"{expected_genemap_xrefs} genemap and {expected_phenotype_xrefs} phenotype MIM xrefs added" in result_message + ), f"{prefix}Expected '{expected_genemap_xrefs} genemap and {expected_phenotype_xrefs} phenotype MIM xrefs added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_synonyms} synonyms (defined by MOVED TO) added" in result_message + ), f"{prefix}Expected '{expected_synonyms} synonyms (defined by MOVED TO) added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_removed_entries} entries removed" in result_message + ), f"{prefix}Expected '{expected_removed_entries} entries removed' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_mim_no_source_id(mim_parser: MIMParser, test_no_source_id: Callable[[MIMParser, int], None]) -> None: + test_no_source_id(mim_parser, SPECIES_ID_HUMAN) + +def test_mim_no_species_id(mim_parser: MIMParser, test_no_species_id: Callable[[MIMParser, int], None]) -> None: + test_no_species_id(mim_parser, SOURCE_ID_MIM) + +def test_mim_no_file(mim_parser: MIMParser, test_no_file: Callable[[MIMParser, int, int], None]) -> None: + test_no_file(mim_parser, SOURCE_ID_MIM, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_mim_file_not_found(mim_parser: MIMParser, test_file_not_found: Callable[[MIMParser, int, int], None]) -> None: + test_file_not_found(mim_parser, SOURCE_ID_MIM, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the TI field is missing +def test_missing_ti_field(mim_parser: MIMParser) -> None: + mock_file_content = [ + "*RECORD*\n*FIELD*\nNO\n100050\n" + ] + mim_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + with patch.object(MIMParser, 'get_file_sections', return_value=mock_file_content): + with pytest.raises(ValueError, match="Failed to extract TI field from record"): + mim_parser.run( + { + "source_id": SOURCE_ID_MIM, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the TI field has an invalid format +def test_invalid_ti_field(mim_parser: MIMParser) -> None: + mock_file_content = [ + "*RECORD*\n*FIELD*\nNO\n100050\n*FIELD*\nTI\nAARSKOG SYNDROME, AUTOSOMAL DOMINANT\n*FIELD*\nTX\n\nDESCRIPTION\n\nAarskog syndrome is characterized by short stature and facial, limb,\n\n*THEEND*\n" + ] + mim_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + with patch.object(MIMParser, 'get_file_sections', return_value=mock_file_content): + with pytest.raises(ValueError, match="Failed to extract record type and description from TI field"): + mim_parser.run( + { + "source_id": SOURCE_ID_MIM, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing of valid MIM data +def test_successful_parsing(mock_xref_dbi: DBConnection, mim_parser: MIMParser) -> None: + mim_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + # Run and validate parsing for MIM file + run_and_validate_parsing(mim_parser, mock_xref_dbi, 2, 4, 2, 1) + + # Check the row counts in the xref and synonym tables + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_GENE}") + check_row_count(mock_xref_dbi, "xref", 4, 
f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_MORBID}") + check_row_count(mock_xref_dbi, "synonym", 4) + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "200150", SOURCE_ID_MIM_GENE, "100500") + check_synonym(mock_xref_dbi, "200150", SOURCE_ID_MIM_MORBID, "100650") + + # Check for re-parsing of the same file + run_and_validate_parsing(mim_parser, mock_xref_dbi, 2, 4, 2, 1, "Re-parsing: ") + + # Re-check the row counts in the xref and synonym tables after re-parsing + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_GENE}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_MORBID}") + check_row_count(mock_xref_dbi, "synonym", 4) diff --git a/src/python/test/xrefs/parsers/test_mirbase_parser.py b/src/python/test/xrefs/parsers/test_mirbase_parser.py new file mode 100644 index 000000000..f9c426c3a --- /dev/null +++ b/src/python/test/xrefs/parsers/test_mirbase_parser.py @@ -0,0 +1,111 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.miRBaseParser import miRBaseParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_sequence + +# Constants +SOURCE_ID_MIRBASE = 1 +SPECIES_ID_C_ELEGANS = 6239 +SPECIES_NAME_C_ELEGANS = "caenorhabditis_elegans" +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" + +# Fixture to create a miRBaseParser instance +@pytest.fixture +def mirbase_parser() -> miRBaseParser: + return miRBaseParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(mirbase_parser: miRBaseParser, mock_xref_dbi: DBConnection, expected_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = mirbase_parser.run( + { + "source_id": SOURCE_ID_MIRBASE, + "species_id": SPECIES_ID_C_ELEGANS, + "species_name": SPECIES_NAME_C_ELEGANS, + "file": "parsers/flatfiles/mirbase.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing miRBase data" + assert ( + f"Read {expected_xrefs} xrefs from" in result_message + ), f"{prefix}Expected 'Read {expected_xrefs} xrefs from' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_mirbase_no_source_id(mirbase_parser: miRBaseParser, test_no_source_id: Callable[[miRBaseParser, int], None]) -> None: + test_no_source_id(mirbase_parser, SPECIES_ID_C_ELEGANS) + +def test_mirbase_no_species_id(mirbase_parser: miRBaseParser, test_no_species_id: Callable[[miRBaseParser, int], None]) -> None: + test_no_species_id(mirbase_parser, SOURCE_ID_MIRBASE) + +def test_mirbase_no_file(mirbase_parser: miRBaseParser, test_no_file: Callable[[miRBaseParser, int, int], None]) -> None: + test_no_file(mirbase_parser, SOURCE_ID_MIRBASE, SPECIES_ID_C_ELEGANS) + +# Test case to check if an error is raised when the file is not found +def test_mirbase_file_not_found(mirbase_parser: miRBaseParser, test_file_not_found: Callable[[miRBaseParser, int, int], None]) -> None: + mirbase_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_C_ELEGANS: [SPECIES_NAME_C_ELEGANS]}) + test_file_not_found(mirbase_parser, SOURCE_ID_MIRBASE, SPECIES_ID_C_ELEGANS) + +# Test case to check if parsing is skipped when no species name can be found +def test_no_species_name(mock_xref_dbi: DBConnection, mirbase_parser: 
miRBaseParser) -> None: + mirbase_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_HUMAN: [SPECIES_NAME_HUMAN]}) + + result_code, result_message = mirbase_parser.run( + { + "source_id": SOURCE_ID_MIRBASE, + "species_id": SPECIES_ID_C_ELEGANS, + "file": "dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"Errors when parsing miRBase data" + assert ( + "Skipped. Could not find species ID to name mapping" in result_message + ), f"Expected 'Skipped. Could not find species ID to name mapping' in result_message, but got: '{result_message}'" + +# Test case to check if no xrefs are added when the species name provided is not in the file +def test_no_xrefs_added(mock_xref_dbi: DBConnection, mirbase_parser: miRBaseParser) -> None: + mirbase_parser.species_id_to_names = MagicMock(return_value={}) + + result_code, result_message = mirbase_parser.run( + { + "source_id": SOURCE_ID_MIRBASE, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "file": f"parsers/flatfiles/mirbase.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"Errors when parsing miRBase data" + assert "No xrefs added" in result_message, f"Expected 'No xrefs added' in result_message, but got: '{result_message}'" + +# Test case to check successful parsing of valid miRBase data +def test_successful_parsing(mock_xref_dbi: DBConnection, mirbase_parser: miRBaseParser) -> None: + mirbase_parser.species_id_to_names = MagicMock(return_value={}) + + # Run and validate parsing for miRBase file + run_and_validate_parsing(mirbase_parser, mock_xref_dbi, 6) + + # Check the row counts in the xref and synonym tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_MIRBASE}") + check_row_count(mock_xref_dbi, "primary_xref", 6) + + # Check the sequences for specific accessions + check_sequence(mock_xref_dbi, "MI0000002", SOURCE_ID_MIRBASE, "ATGCTTCCGGCCTGTTCCCTGAGACCTCAAGTGTGAGTGTACTATTGATGCTTCACACCTGGGCTCTCCGGGTACCAGGACGGTTTGAGCAGAT") + check_sequence(mock_xref_dbi, "MI0000006", SOURCE_ID_MIRBASE, "TCTCGGATCAGATCGAGCCATTGCTGGTTTCTTCCACAGTGGTACTTTCCATTAGAACTATCACCGGGTGGAAACTAGCAGTGGCTCGATCTTTTCC") + + # Run and validate parsing for miRBase file + run_and_validate_parsing(mirbase_parser, mock_xref_dbi, 6, "Re-parsing: ") + + # Check the row counts in the xref and synonym tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_MIRBASE}") + check_row_count(mock_xref_dbi, "primary_xref", 6) diff --git a/src/python/test/xrefs/parsers/test_reactome_parser.py b/src/python/test/xrefs/parsers/test_reactome_parser.py new file mode 100644 index 000000000..9187fde0e --- /dev/null +++ b/src/python/test/xrefs/parsers/test_reactome_parser.py @@ -0,0 +1,166 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable +from sqlalchemy import text + +from ensembl.production.xrefs.parsers.ReactomeParser import ReactomeParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link, check_dependent_xref_link, check_release + +# Constants +SOURCE_ID_REACTOME = 1 +SOURCE_ID_REACTOME_DIRECT = 2 +SOURCE_ID_REACTOME_UNIPROT = 3 +SOURCE_ID_REACTOME_GENE = 4 +SOURCE_ID_REACTOME_TRANSCRIPT = 5 +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" + +# Fixture to create a ReactomeParser instance +@pytest.fixture +def reactome_parser() -> ReactomeParser: + return ReactomeParser(True) + +# Function to populate 
the database with sources +def populate_xref_db(mock_xref_dbi: DBConnection): + source_data = [ + [SOURCE_ID_REACTOME, 'reactome', 10, ''], + [SOURCE_ID_REACTOME_TRANSCRIPT, 'reactome_transcript', 10, ''], + [SOURCE_ID_REACTOME_GENE, 'reactome_gene', 10, ''], + [SOURCE_ID_REACTOME_DIRECT, 'reactome', 10, 'direct'], + [SOURCE_ID_REACTOME_UNIPROT, 'reactome', 10, 'uniprot'], + ] + for row in source_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO source (source_id, name, ordered, priority_description) + VALUES (:source_id, :name, :ordered, :priority_description) + """ + ), + { + "source_id": row[0], + "name": row[1], + "ordered": row[2], + "priority_description": row[3], + } + ) + +# Function to run and validate the parsing process +def run_and_validate_parsing(reactome_parser: ReactomeParser, mock_xref_dbi: DBConnection, file: str, expected_processed: int, expected_dependent: int, expected_direct: int, expected_errors: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = reactome_parser.run( + { + "source_id": SOURCE_ID_REACTOME, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "file": f"parsers/flatfiles/{file}.txt", + "rel_file": "parsers/flatfiles/reactome_release.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing Reactome data" + assert ( + f"{expected_processed} Reactome entries processed" in result_message + ), f"{prefix}Expected '{expected_processed} Reactome entries processed' in result_message, but got: '{result_message}'" + assert ( + f"{expected_dependent} dependent xrefs added" in result_message + ), f"{prefix}Expected '{expected_dependent} dependent xrefs added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_direct} direct xrefs added" in result_message + ), f"{prefix}Expected '{expected_direct} direct xrefs added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_errors} not found" in result_message + ), f"{prefix}Expected '{expected_errors} not found' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_reactome_no_source_id(reactome_parser: ReactomeParser, test_no_source_id: Callable[[ReactomeParser, int], None]) -> None: + test_no_source_id(reactome_parser, SPECIES_ID_HUMAN) + +def test_reactome_no_species_id(reactome_parser: ReactomeParser, test_no_species_id: Callable[[ReactomeParser, int], None]) -> None: + test_no_species_id(reactome_parser, SOURCE_ID_REACTOME) + +def test_reactome_no_file(reactome_parser: ReactomeParser, test_no_file: Callable[[ReactomeParser, int, int], None]) -> None: + test_no_file(reactome_parser, SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) + +# Test case to check if parsing is skipped when no species name can be found +def test_no_species_name(mock_xref_dbi: DBConnection, reactome_parser: ReactomeParser) -> None: + result_code, result_message = reactome_parser.run( + { + "source_id": SOURCE_ID_REACTOME, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"Errors when parsing Reactome data" + assert ( + "Skipped. Could not find species ID to name mapping" in result_message + ), f"Expected 'Skipped. 
Could not find species ID to name mapping' in result_message, but got: '{result_message}'" + +# Test case to check if an error is raised when the required source_id is missing +def test_reactome_missing_required_source_id(reactome_parser: ReactomeParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[ReactomeParser, DBConnection, str, int, int, str], None]) -> None: + reactome_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_HUMAN: [SPECIES_NAME_HUMAN]}) + test_missing_required_source_id(reactome_parser, mock_xref_dbi, 'reactome', SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_reactome_file_not_found(reactome_parser: ReactomeParser, test_file_not_found: Callable[[ReactomeParser, int, int], None]) -> None: + reactome_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_HUMAN: [SPECIES_NAME_HUMAN]}) + test_file_not_found(reactome_parser, SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is empty +def test_reactome_empty_file(reactome_parser: ReactomeParser, test_empty_file: Callable[[ReactomeParser, str, int, int], None]) -> None: + reactome_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_HUMAN: [SPECIES_NAME_HUMAN]}) + test_empty_file(reactome_parser, 'Reactome', SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) + +# Test case to check successful parsing of valid Reactome data without existing uniprot xrefs +def test_successful_parsing_without_existing_uniprot(mock_xref_dbi: DBConnection, reactome_parser: ReactomeParser) -> None: + populate_xref_db(mock_xref_dbi) + + # Run and validate parsing for Uniprot and Ensembl Reactome files + run_and_validate_parsing(reactome_parser, mock_xref_dbi, "reactome_UniProt", 8, 0, 0, 0) + run_and_validate_parsing(reactome_parser, mock_xref_dbi, "reactome_ensembl", 14, 0, 13, 1) + + # Check the row counts in the xref and direct_xref tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_GENE}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_TRANSCRIPT}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_DIRECT}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 6) + check_row_count(mock_xref_dbi, "transcript_direct_xref", 4) + check_row_count(mock_xref_dbi, "translation_direct_xref", 3) + + # Check the link between an xref and direct_xref tables + check_direct_xref_link(mock_xref_dbi, "gene", "R-HSA-1643685", "ENSG00000000419") + check_direct_xref_link(mock_xref_dbi, "transcript", "R-HSA-199991", "ENST00000000233") + check_direct_xref_link(mock_xref_dbi, "translation", "R-HSA-199991", "ENSP00000000233") + +# Test case to check successful parsing of valid Reactome data with existing uniprot xrefs +def test_successful_parsing_with_existing_uniprot(mock_xref_dbi: DBConnection, reactome_parser: ReactomeParser) -> None: + populate_xref_db(mock_xref_dbi) + reactome_parser.get_valid_codes = MagicMock(return_value={"A0A075B6P5": [12], "A0A075B6S6" : [34, 56], "A0A087WPF7": [78], "A0A096LNF2": [90]}) + + # Run and validate re-parsing for Uniprot and Ensembl Reactome files + run_and_validate_parsing(reactome_parser, mock_xref_dbi, "reactome_UniProt", 8, 6, 0, 0, "Re-parsing: ") + run_and_validate_parsing(reactome_parser, mock_xref_dbi, "reactome_ensembl", 14, 0, 13, 1, "Re-parsing: ") + + # Check the row counts in the xref, direct_xref, and 
dependent_xref tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_GENE}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_TRANSCRIPT}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_DIRECT}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_REACTOME_UNIPROT}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 6) + check_row_count(mock_xref_dbi, "transcript_direct_xref", 4) + check_row_count(mock_xref_dbi, "translation_direct_xref", 3) + check_row_count(mock_xref_dbi, "dependent_xref", 5) + + # Check the link between an xref and dependent_xref + check_dependent_xref_link(mock_xref_dbi, "R-HSA-1280218", 34) + check_dependent_xref_link(mock_xref_dbi, "R-HSA-1280218", 56) + check_dependent_xref_link(mock_xref_dbi, "R-HSA-166663", 90) + + # Check the release info + check_release(mock_xref_dbi, SOURCE_ID_REACTOME, "88") diff --git a/src/python/test/xrefs/parsers/test_refseq_parser.py b/src/python/test/xrefs/parsers/test_refseq_parser.py new file mode 100644 index 000000000..2b8a77f2c --- /dev/null +++ b/src/python/test/xrefs/parsers/test_refseq_parser.py @@ -0,0 +1,243 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable, Dict +from sqlalchemy import text + +from ensembl.production.xrefs.parsers.RefSeqParser import RefSeqParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_dependent_xref_link, check_sequence, check_release + +# Constants +SOURCE_ID_REFSEQ = 1 +SOURCE_ID_REFSEQ_MRNA = 2 +SOURCE_ID_REFSEQ_NCRNA = 3 +SOURCE_ID_REFSEQ_MRNA_PREDICTED = 4 +SOURCE_ID_REFSEQ_NCRNA_PREDICTED = 5 +SOURCE_ID_REFSEQ_PEPTIDE = 6 +SOURCE_ID_REFSEQ_PEPTIDE_PREDICTED = 7 +SOURCE_ID_ENTREZGENE = 8 +SOURCE_ID_WIKIGENE = 9 +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" + +# Fixture to create a RefSeqParser instance +@pytest.fixture +def refseq_parser() -> RefSeqParser: + return RefSeqParser(True) + +# Function to populate the database with EntrezGene and WikiGene xrefs +def populate_xref_db(mock_xref_dbi: DBConnection): + source_data = [ + [SOURCE_ID_REFSEQ_MRNA, 'RefSeq_mRNA', 10, 'refseq'], + [SOURCE_ID_REFSEQ_MRNA_PREDICTED, 'RefSeq_mRNA_predicted', 10, 'refseq'], + [SOURCE_ID_REFSEQ_NCRNA, 'RefSeq_ncRNA', 10, ''], + [SOURCE_ID_REFSEQ_NCRNA_PREDICTED, 'RefSeq_ncRNA_predicted', 10, ''], + [SOURCE_ID_REFSEQ_PEPTIDE, 'RefSeq_peptide', 10, ''], + [SOURCE_ID_REFSEQ_PEPTIDE_PREDICTED, 'RefSeq_peptide_predicted', 10, ''], + [SOURCE_ID_ENTREZGENE, 'EntrezGene', 10, ''], + [SOURCE_ID_WIKIGENE, 'WikiGene', 10, ''], + ] + for row in source_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO source (source_id, name, ordered, priority_description) + VALUES (:source_id, :name, :ordered, :priority_description) + """ + ), + { + "source_id": row[0], + "name": row[1], + "ordered": row[2], + "priority_description": row[3], + } + ) + + xref_data = [ + [1, '105373289', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'LOC105373289'], + [2, '105373289', SOURCE_ID_WIKIGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'LOC105373289'], + [3, '100128640', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'ACVR2B-AS1'], + [4, '100128640', SOURCE_ID_WIKIGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'ACVR2B-AS1'], + [5, '102465874', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'MIR8075'], + [6, '102465874', SOURCE_ID_WIKIGENE, 
SPECIES_ID_HUMAN, 'DEPENDENT', 'MIR8075'], + [7, '401447', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'USP17L1'], + [8, '401447', SOURCE_ID_WIKIGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'USP17L1'], + [9, '728393', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'USP17L27'], + [10, '728393', SOURCE_ID_WIKIGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'USP17L27'], + ] + for row in xref_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO xref (xref_id, accession, source_id, species_id, info_type, label) + VALUES (:xref_id, :accession, :source_id, :species_id, :info_type, :label) + """ + ), + { + "xref_id": row[0], + "accession": row[1], + "source_id": row[2], + "species_id": row[3], + "info_type": row[4], + "label": row[5], + } + ) + + mock_xref_dbi.commit() + +# Mock for get_source_id_for_source_name +def mock_get_source_id_for_source_name(source_name: str, mock_xref_dbi: DBConnection, desc: str = None) -> int: + source_mapping = { + "RefSeq_peptide": SOURCE_ID_REFSEQ_PEPTIDE, + "RefSeq_mRNA": SOURCE_ID_REFSEQ_MRNA, + "RefSeq_ncRNA": SOURCE_ID_REFSEQ_NCRNA, + "RefSeq_peptide_predicted": SOURCE_ID_REFSEQ_PEPTIDE_PREDICTED, + "RefSeq_mRNA_predicted": SOURCE_ID_REFSEQ_MRNA_PREDICTED, + "RefSeq_ncRNA_predicted": SOURCE_ID_REFSEQ_NCRNA_PREDICTED, + "EntrezGene": SOURCE_ID_ENTREZGENE, + "WikiGene": SOURCE_ID_WIKIGENE, + } + return source_mapping.get(source_name, SOURCE_ID_REFSEQ) + +# Function to run and validate the parsing process +def run_and_validate_parsing(refseq_parser: RefSeqParser, mock_xref_dbi: DBConnection, file:str, expected_xrefs: Dict[str, int], prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = refseq_parser.run( + { + "source_id": SOURCE_ID_REFSEQ, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "file": f"parsers/flatfiles/{file}.txt", + "rel_file": "parsers/flatfiles/refseq_release.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + mrna = expected_xrefs["num_mrna"] + mrna_pred = expected_xrefs["num_pred_mrna"] + ncrna = expected_xrefs["num_ncrna"] + ncrna_pred = expected_xrefs["num_pred_ncrna"] + peptide = expected_xrefs["num_peptide"] + peptide_pred = expected_xrefs["num_pred_peptide"] + entrez = expected_xrefs["num_entrez"] + wiki = expected_xrefs["num_wiki"] + + assert result_code == 0, f"{prefix}Errors when parsing RefSeq GPFF data" + assert ( + f"Added {mrna} mRNA xrefs, {mrna_pred} predicted mRNA xrefs," in result_message + ), f"{prefix}Expected 'Added {mrna} mRNA xrefs, {mrna_pred} predicted mRNA xrefs,' in result_message, but got: '{result_message}'" + assert ( + f"{ncrna} ncRNA xrefs, {ncrna_pred} predicted ncRNA xrefs," in result_message + ), f"{prefix}Expected '{ncrna} ncRNA xrefs, {ncrna_pred} predicted ncRNA xrefs,' in result_message, but got: '{result_message}'" + assert ( + f"{peptide} peptide xrefs, and {peptide_pred} predicted peptide xrefs" in result_message + ), f"{prefix}Expected '{peptide} peptide xrefs, and {peptide_pred} predicted peptide xref' in result_message, but got: '{result_message}'" + assert ( + f"EntrezGene\t{entrez}" in result_message + ), f"{prefix}Expected 'EntrezGene\t{entrez}' in result_message, but got: '{result_message}'" + assert ( + f"WikiGene\t{wiki}" in result_message + ), f"{prefix}Expected 'WikiGene\t{wiki}' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_refseq_no_source_id(refseq_parser: RefSeqParser, test_no_source_id: Callable[[RefSeqParser, int], 
None]) -> None: + test_no_source_id(refseq_parser, SPECIES_ID_HUMAN) + +def test_refseq_no_species_id(refseq_parser: RefSeqParser, test_no_species_id: Callable[[RefSeqParser, int], None]) -> None: + test_no_species_id(refseq_parser, SOURCE_ID_REFSEQ_MRNA) + +def test_refseq_no_file(refseq_parser: RefSeqParser, test_no_file: Callable[[RefSeqParser, int, int], None]) -> None: + test_no_file(refseq_parser, SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the required source_id is missing +def test_refseq_missing_required_source_id(refseq_parser: RefSeqParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[RefSeqParser, DBConnection, str, int, int, str], None]) -> None: + test_missing_required_source_id(refseq_parser, mock_xref_dbi, 'RefSeq_peptide', SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) + +# Test case to check if parsing is skipped when no species name can be found +def test_no_species_name(mock_xref_dbi: DBConnection, refseq_parser: RefSeqParser) -> None: + refseq_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + result_code, result_message = refseq_parser.run( + { + "source_id": SOURCE_ID_REFSEQ, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"Errors when parsing RefSeq data" + assert ( + "Skipped. Could not find species ID to name mapping" in result_message + ), f"Expected 'Skipped. Could not find species ID to name mapping' in result_message, but got: '{result_message}'" + +# Test case to check if parsing is skipped when the file type is not supported +def test_invalid_file_type(mock_xref_dbi: DBConnection, refseq_parser: RefSeqParser) -> None: + refseq_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + result_code, result_message = refseq_parser.run( + { + "source_id": SOURCE_ID_REFSEQ, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"Errors when parsing RefSeq data" + assert ( + "Skipped. Could not work out sequence type" in result_message + ), f"Expected 'Skipped. 
Could not work out sequence type' in result_message, but got: '{result_message}'" + +# Test case to check if an error is raised when the file is not found +def test_refseq_file_not_found(refseq_parser: RefSeqParser, test_file_not_found: Callable[[RefSeqParser, int, int], None]) -> None: + refseq_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + refseq_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_HUMAN: [SPECIES_NAME_HUMAN]}) + refseq_parser.type_from_file = MagicMock(return_value="dna") + + test_file_not_found(refseq_parser, SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) + +# Test case to check successful parsing of valid RefSeq GPFF data +def test_successful_parsing(mock_xref_dbi: DBConnection, refseq_parser: RefSeqParser) -> None: + populate_xref_db(mock_xref_dbi) + + # Check the row counts in the xref table before running the parser + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_WIKIGENE}") + check_row_count(mock_xref_dbi, "dependent_xref", 0) + + # Run and validate parsing for RefSeq dna and peptide files + expected_counts = {"num_mrna": 5, "num_pred_mrna": 2, "num_ncrna": 2, "num_pred_ncrna": 1, "num_peptide": 0, "num_pred_peptide": 0, "num_entrez": 5, "num_wiki": 5} + run_and_validate_parsing(refseq_parser, mock_xref_dbi, "refseq_rna", expected_counts) + expected_counts = {"num_mrna": 0, "num_pred_mrna": 0, "num_ncrna": 0, "num_pred_ncrna": 0, "num_peptide": 5, "num_pred_peptide": 3, "num_entrez": 2, "num_wiki": 2} + run_and_validate_parsing(refseq_parser, mock_xref_dbi, "refseq_protein", expected_counts) + + # Check the row counts in the xref, dependent_xref, and primary_xref tables + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_REFSEQ_MRNA}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_REFSEQ_MRNA_PREDICTED}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_REFSEQ_NCRNA}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_REFSEQ_NCRNA_PREDICTED}") + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_REFSEQ_PEPTIDE}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_REFSEQ_PEPTIDE_PREDICTED}") + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_WIKIGENE}") + check_row_count(mock_xref_dbi, "dependent_xref", 16) + check_row_count(mock_xref_dbi, "primary_xref", 18) + + # Check the link between an xref and dependent_xref + master_xref_id = mock_xref_dbi.execute(text(f"SELECT xref_id FROM xref WHERE accession='NR_168385' AND source_id={SOURCE_ID_REFSEQ_NCRNA}")).scalar() + check_dependent_xref_link(mock_xref_dbi, "105373289", master_xref_id) + master_xref_id = mock_xref_dbi.execute(text(f"SELECT xref_id FROM xref WHERE accession='NP_001229259' AND source_id={SOURCE_ID_REFSEQ_PEPTIDE}")).scalar() + check_dependent_xref_link(mock_xref_dbi, "728393", master_xref_id) + master_xref_id = mock_xref_dbi.execute(text(f"SELECT xref_id FROM xref WHERE accession='NM_001242328' AND source_id={SOURCE_ID_REFSEQ_MRNA}")).scalar() + 
check_dependent_xref_link(mock_xref_dbi, "728393", master_xref_id) + + # Check the sequences for specific accessions + check_sequence(mock_xref_dbi, "NM_039939", SOURCE_ID_REFSEQ_MRNA, "taaatgtcttactgcttttactgttccctcctagagtccattctttactctaggagggaatagtaaaagcagtaagacattta") + check_sequence(mock_xref_dbi, "NP_001355183", SOURCE_ID_REFSEQ_PEPTIDE, "mllmvvsmacvglflvqragphmggqdkpflsawpsavvprgghvtlrchyrhrfnnfmlykedrihvpifhgrifqegfnmspvttahagnytcrgshphsptgwsapsnpmvimvtgnhrwcsnkkkcccngpracreqk") + + # Check the release info + check_release(mock_xref_dbi, SOURCE_ID_REFSEQ_MRNA, "NCBI Reference Sequence (RefSeq) Database Release 224, May 6, 2024") \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_rfam_parser.py b/src/python/test/xrefs/parsers/test_rfam_parser.py new file mode 100644 index 000000000..86caa9669 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_rfam_parser.py @@ -0,0 +1,130 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.RFAMParser import RFAMParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link + +# Constants +SOURCE_ID_RFAM = 1 +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" + +# Fixture to create an RFAMParser instance +@pytest.fixture +def rfam_parser() -> RFAMParser: + return RFAMParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(rfam_parser: RFAMParser, mock_xref_dbi: DBConnection, expected_xrefs: int, expected_direct_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = rfam_parser.run( + { + "source_id": SOURCE_ID_RFAM, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "file": "parsers/flatfiles/rfam.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing RFAM data" + assert ( + f"Added {expected_xrefs} RFAM xrefs and {expected_direct_xrefs} direct xrefs" in result_message + ), f"{prefix}Expected 'Added {expected_xrefs} RFAM xrefs and {expected_direct_xrefs} direct xrefs' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_rfam_no_source_id(rfam_parser: RFAMParser, test_no_source_id: Callable[[RFAMParser, int], None]) -> None: + test_no_source_id(rfam_parser, SPECIES_ID_HUMAN) + +def test_rfam_no_species_id(rfam_parser: RFAMParser, test_no_species_id: Callable[[RFAMParser, int], None]) -> None: + test_no_species_id(rfam_parser, SOURCE_ID_RFAM) + +def test_rfam_no_file(rfam_parser: RFAMParser, test_no_file: Callable[[RFAMParser, int, int], None]) -> None: + test_no_file(rfam_parser, SOURCE_ID_RFAM, SPECIES_ID_HUMAN) + +# Test case to check if parsing is skipped when no species name can be found +def test_no_species_name(mock_xref_dbi: DBConnection, rfam_parser: RFAMParser) -> None: + result_code, result_message = rfam_parser.run( + { + "source_id": SOURCE_ID_RFAM, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"Errors when parsing RFAM data" + assert ( + "Skipped. Could not find species ID to name mapping" in result_message + ), f"Expected 'Skipped. 
Could not find species ID to name mapping' in result_message, but got: '{result_message}'" + +# Test case to check if an error is raised when no RFAM database is provided +def test_no_rfam_db(rfam_parser: RFAMParser) -> None: + rfam_parser.get_db_from_registry = MagicMock(return_value=None) + + with pytest.raises( + AttributeError, match="Could not find RFAM DB." + ): + rfam_parser.run( + { + "source_id": SOURCE_ID_RFAM, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "file": "dummy_file.txt", + "ensembl_release": "100", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the file is not found +def test_rfam_file_not_found(rfam_parser: RFAMParser, test_file_not_found: Callable[[RFAMParser, int, int], None]) -> None: + rfam_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_HUMAN: [SPECIES_NAME_HUMAN]}) + rfam_parser.get_rfam_db_url = MagicMock(return_value="mock_rfam_db_url") + rfam_parser.get_rfam_transcript_stable_ids = MagicMock(return_value={}) + test_file_not_found(rfam_parser, SOURCE_ID_RFAM, SPECIES_ID_HUMAN) + +# Test case to check successful parsing of valid RFAM data without existing RFAM xrefs in RFAM DB +def test_successful_parsing_without_existing_rfam_data(mock_xref_dbi: DBConnection, rfam_parser: RFAMParser) -> None: + rfam_parser.get_rfam_db_url = MagicMock(return_value="mock_rfam_db_url") + rfam_parser.get_rfam_transcript_stable_ids = MagicMock(return_value={}) + + # Run and validate parsing for RFAM file + run_and_validate_parsing(rfam_parser, mock_xref_dbi, 0, 0) + + # Check the row counts in the xref and transcript_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 0) + check_row_count(mock_xref_dbi, "transcript_direct_xref", 0) + +# Test case to check successful parsing of valid RFAM data with existing RFAM xrefs in RFAM DB +def test_successful_parsing_with_existing_rfam_data(mock_xref_dbi: DBConnection, rfam_parser: RFAMParser) -> None: + # Mock existing RFAM-to-transcript mappings in the RFAM DB + rfam_parser.get_rfam_db_url = MagicMock(return_value="mock_rfam_db_url") + rfam_parser.get_rfam_transcript_stable_ids = MagicMock(return_value={ + "RF00001": ["ENST00000516887", "ENST00000516971", "ENST00000622298", "ENST00000674448"], + "RF00002": ["ENST00000363564", "ENST00000515896"], + "RF00003": ["ENST00000353977"], + "RF00006": ["ENST00000362552", "ENST00000363120", "ENST00000365241", "ENST00000365645", "ENST00000516091"] + }) + + # Run and validate parsing for RFAM file + run_and_validate_parsing(rfam_parser, mock_xref_dbi, 4, 12) + + # Check the row counts in the xref and transcript_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 4) + check_row_count(mock_xref_dbi, "transcript_direct_xref", 12) + + # Check the link between an xref and transcript_direct_xref + check_direct_xref_link(mock_xref_dbi, "transcript", "RF00002", "ENST00000515896") + check_direct_xref_link(mock_xref_dbi, "transcript", "RF00006", "ENST00000362552") + check_direct_xref_link(mock_xref_dbi, "transcript", "RF00006", "ENST00000365645") + + # Run and validate re-parsing for RFAM file + run_and_validate_parsing(rfam_parser, mock_xref_dbi, 4, 12, "Re-parsing: ") + + # Check the row counts in the xref and transcript_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 4) + check_row_count(mock_xref_dbi, "transcript_direct_xref", 12) \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_rgd_parser.py b/src/python/test/xrefs/parsers/test_rgd_parser.py new file mode 100644 index 
000000000..2b8019c3f --- /dev/null +++ b/src/python/test/xrefs/parsers/test_rgd_parser.py @@ -0,0 +1,126 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.RGDParser import RGDParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link, check_dependent_xref_link, check_synonym + +# Constants +SOURCE_ID_RGD = 1 +SOURCE_ID_DIRECT = 2 +SPECIES_ID_RAT = 10116 + +# Fixture to create an RGDParser instance +@pytest.fixture +def rgd_parser() -> RGDParser: + return RGDParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(rgd_parser: RGDParser, mock_xref_dbi: DBConnection, expected_dependent_xrefs: int, expected_direct_xrefs: int, expected_mismatch: int, expected_synonyms: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = rgd_parser.run( + { + "source_id": SOURCE_ID_RGD, + "species_id": SPECIES_ID_RAT, + "file": "parsers/flatfiles/rgd.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing RGD data" + assert ( + f"{expected_dependent_xrefs} xrefs successfully loaded and dependent on refseq" in result_message + ), f"{prefix}Expected '{expected_dependent_xrefs} xrefs successfully loaded and dependent on refseq' in result_message, but got: '{result_message}'" + assert ( + f"{expected_mismatch} xrefs added but with NO dependencies" in result_message + ), f"{prefix}Expected '{expected_mismatch} xrefs added but with NO dependencies' in result_message, but got: '{result_message}'" + assert ( + f"{expected_direct_xrefs} direct xrefs successfully loaded" in result_message + ), f"{prefix}Expected '{expected_direct_xrefs} direct xrefs successfully loaded' in result_message, but got: '{result_message}'" + assert ( + f"Added {expected_synonyms} synonyms, including duplicates" in result_message + ), f"{prefix}Expected 'Added {expected_synonyms} synonyms, including duplicates' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_rgd_no_source_id(rgd_parser: RGDParser, test_no_source_id: Callable[[RGDParser, int], None]) -> None: + test_no_source_id(rgd_parser, SPECIES_ID_RAT) + +def test_rgd_no_species_id(rgd_parser: RGDParser, test_no_species_id: Callable[[RGDParser, int], None]) -> None: + test_no_species_id(rgd_parser, SOURCE_ID_RGD) + +def test_rgd_no_file(rgd_parser: RGDParser, test_no_file: Callable[[RGDParser, int, int], None]) -> None: + test_no_file(rgd_parser, SOURCE_ID_RGD, SPECIES_ID_RAT) + +# Test case to check if an error is raised when the file is not found +def test_rgd_file_not_found(rgd_parser: RGDParser, test_file_not_found: Callable[[RGDParser, int, int], None]) -> None: + rgd_parser.get_source_id_for_source_name = MagicMock(return_value=SOURCE_ID_DIRECT) + test_file_not_found(rgd_parser, SOURCE_ID_RGD, SPECIES_ID_RAT) + +# Test case to check if an error is raised when the file is empty +def test_rgd_empty_file(rgd_parser: RGDParser, test_empty_file: Callable[[RGDParser, str, int, int], None]) -> None: + rgd_parser.get_source_id_for_source_name = MagicMock(return_value=SOURCE_ID_DIRECT) + test_empty_file(rgd_parser, 'RGD', SOURCE_ID_RGD, SPECIES_ID_RAT) + +# Test case to check if an error is raised when the required source_id is missing +def test_rgd_missing_required_source_id(rgd_parser: RGDParser, mock_xref_dbi: 
DBConnection, test_missing_required_source_id: Callable[[RGDParser, DBConnection, str, int, int, str], None]) -> None: + test_missing_required_source_id(rgd_parser, mock_xref_dbi, 'RGD', SOURCE_ID_RGD, SPECIES_ID_RAT) + +# Test case to check successful parsing of valid RGD data without existing refseqs +def test_successful_parsing_without_refseqs(mock_xref_dbi: DBConnection, rgd_parser: RGDParser) -> None: + rgd_parser.get_source_id_for_source_name = MagicMock(return_value=SOURCE_ID_DIRECT) + + # Run and validate parsing for RGD file without existing refseqs + run_and_validate_parsing(rgd_parser, mock_xref_dbi, 0, 5, 2, 6) + + # Check the row counts in the xref, gene_direct_xref, dependent_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DIRECT' AND source_id={SOURCE_ID_DIRECT}") + check_row_count(mock_xref_dbi, "xref", 0, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_RGD}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='MISC' AND source_id={SOURCE_ID_RGD}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 5) + check_row_count(mock_xref_dbi, "dependent_xref", 0) + check_row_count(mock_xref_dbi, "synonym", 4) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "2004", "ENSRNOG00000028896") + +# Test case to check successful parsing of valid RGD data with refseqs +def test_successful_parsing_with_refseqs(mock_xref_dbi: DBConnection, rgd_parser: RGDParser) -> None: + rgd_parser.get_source_id_for_source_name = MagicMock(return_value=SOURCE_ID_DIRECT) + rgd_parser.get_valid_codes = MagicMock(return_value={"NM_052979": [12, 34], "XM_039101774" : [56], "XM_063281326": [78]}) + + # Run and validate parsing for RGD file with existing refseqs + run_and_validate_parsing(rgd_parser, mock_xref_dbi, 3, 5, 1, 12) + + # Check the row counts in the xref, gene_direct_xref, dependent_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DIRECT' AND source_id={SOURCE_ID_DIRECT}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_RGD}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='MISC' AND source_id={SOURCE_ID_RGD}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 5) + check_row_count(mock_xref_dbi, "dependent_xref", 3) + check_row_count(mock_xref_dbi, "synonym", 8) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "2012", "ENSRNOG00000009845") + + # Check the link between an xref and dependent_xref + check_dependent_xref_link(mock_xref_dbi, "2003", 12) + check_dependent_xref_link(mock_xref_dbi, "2003", 34) + check_dependent_xref_link(mock_xref_dbi, "2007", 56) + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "2003", SOURCE_ID_DIRECT, "ASP") + check_synonym(mock_xref_dbi, "2007", SOURCE_ID_RGD, "PMP70, 70-kDa peroxisomal membrane protein") + + # Run and validate re-parsing for RGD file + run_and_validate_parsing(rgd_parser, mock_xref_dbi, 3, 5, 1, 12, "Re-parsing: ") + + # Check the row counts in the xref, gene_direct_xref, dependent_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DIRECT' AND source_id={SOURCE_ID_DIRECT}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_RGD}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='MISC' AND source_id={SOURCE_ID_RGD}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 5) + check_row_count(mock_xref_dbi, 
"dependent_xref", 3) + check_row_count(mock_xref_dbi, "synonym", 8) diff --git a/src/python/test/xrefs/parsers/test_ucsc_parser.py b/src/python/test/xrefs/parsers/test_ucsc_parser.py new file mode 100644 index 000000000..ae96e4d3f --- /dev/null +++ b/src/python/test/xrefs/parsers/test_ucsc_parser.py @@ -0,0 +1,89 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.UCSCParser import UCSCParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count + +# Constants +SOURCE_ID_UCSC = 1 +SPECIES_ID_HUMAN = 9606 + +# Fixture to create a UCSCParser instance +@pytest.fixture +def ucsc_parser() -> UCSCParser: + return UCSCParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(ucsc_parser: UCSCParser, mock_xref_dbi: DBConnection, expected_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = ucsc_parser.run( + { + "source_id": SOURCE_ID_UCSC, + "species_id": SPECIES_ID_HUMAN, + "file": "parsers/flatfiles/ucsc.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing UCSC data" + assert ( + f"Loaded a total of {expected_xrefs} UCSC xrefs" in result_message + ), f"{prefix}Expected 'Loaded a total of {expected_xrefs} UCSC xrefs' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_ucsc_no_source_id(ucsc_parser: UCSCParser, test_no_source_id: Callable[[UCSCParser, int], None]) -> None: + test_no_source_id(ucsc_parser, SPECIES_ID_HUMAN) + +def test_ucsc_no_species_id(ucsc_parser: UCSCParser, test_no_species_id: Callable[[UCSCParser, int], None]) -> None: + test_no_species_id(ucsc_parser, SOURCE_ID_UCSC) + +def test_ucsc_no_file(ucsc_parser: UCSCParser, test_no_file: Callable[[UCSCParser, int, int], None]) -> None: + test_no_file(ucsc_parser, SOURCE_ID_UCSC, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_ucsc_file_not_found(ucsc_parser: UCSCParser, test_file_not_found: Callable[[UCSCParser, int, int], None]) -> None: + test_file_not_found(ucsc_parser, SOURCE_ID_UCSC, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is empty +def test_ucsc_empty_file(ucsc_parser: UCSCParser, test_empty_file: Callable[[UCSCParser, str, int, int], None]) -> None: + test_empty_file(ucsc_parser, 'UCSC', SOURCE_ID_UCSC, SPECIES_ID_HUMAN) + +# Parametrized test case to check if an error is raised for various missing keys +@pytest.mark.parametrize( + "line", [ + ("ENST00000619216.1\tchr1\t-\t17368\t17436\t17368\t17368\t1\t17368,\t17436,\t\t\n"), + ("ENST00000619216.1\t \t-\t17368\t17436\t17368\t17368\t1\t17368,\t17436,\t\tuc031tla.1\n"), + ("ENST00000619216.1\tchr1\t\t17368\t17436\t17368\t17368\t1\t17368,\t17436,\t\tuc031tla.1\n"), + ("ENST00000619216.1\tchr1\t-\t\t17436\t17368\t17368\t1\t17368,\t17436,\t\tuc031tla.1\n"), + ("ENST00000619216.1\tchr1\t-\t17368\t\t17368\t17368\t1\t17368,\t17436,\t\tuc031tla.1\n"), + ("ENST00000619216.1\tchr1\t-\t17368\t17436\t17368\t17368\t1\t\t17436,\t\tuc031tla.1\n"), + ("ENST00000619216.1\tchr1\t-\t17368\t17436\t17368\t17368\t1\t17368,\t \t\tuc031tla.1\n"), + ], + ids=["accession column", "chromosome column", "strand column", "txStart column", "txEnd column", "exonStarts column", "exonEnds column"], +) +def test_missing_keys(ucsc_parser: UCSCParser, line: 
str) -> None: + mock_file = io.StringIO(line) + ucsc_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Missing required key for xref"): + ucsc_parser.run( + { + "source_id": SOURCE_ID_UCSC, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing of valid UCSC data +def test_successful_parsing(mock_xref_dbi: DBConnection, ucsc_parser: UCSCParser) -> None: + # Run and validate parsing for UCSC file + run_and_validate_parsing(ucsc_parser, mock_xref_dbi, 10) + + # Check the row counts in the coordinate_xref table + check_row_count(mock_xref_dbi, "coordinate_xref", 10) \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_uniprot_parser.py b/src/python/test/xrefs/parsers/test_uniprot_parser.py new file mode 100644 index 000000000..0cf0e2cc7 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_uniprot_parser.py @@ -0,0 +1,181 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable, Dict +from sqlalchemy import text + +from ensembl.production.xrefs.parsers.UniProtParser import UniProtParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_synonym, check_direct_xref_link, check_dependent_xref_link, check_sequence, check_release + +# Constants +SOURCE_ID_UNIPROT = 1 +SOURCE_ID_SWISSPROT = 2 +SOURCE_ID_TREMBL = 3 +SOURCE_ID_TREMBL_NON_DISPLAY = 4 +SOURCE_ID_SWISSPROT_DIRECT = 5 +SOURCE_ID_TREMBL_DIRECT = 6 +SOURCE_ID_ISOFORM = 7 +SOURCE_ID_PDB = 8 +SOURCE_ID_STRING = 9 +SOURCE_ID_EMBL = 10 +SOURCE_ID_BIOGRID = 11 +SOURCE_ID_CHEMBL = 12 +SOURCE_ID_UNIPROT_GN = 13 +SOURCE_ID_PROTEIN_ID = 14 +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" + +# Fixture to create a UniProtParser instance +@pytest.fixture +def uniprot_parser() -> UniProtParser: + return UniProtParser(True) + +# Function to populate the database with sources +def populate_xref_db(mock_xref_dbi: DBConnection): + source_data = [ + [SOURCE_ID_SWISSPROT, 'Uniprot/SWISSPROT', 10, 'sequence_mapped'], + [SOURCE_ID_TREMBL, 'Uniprot/SPTREMBL', 10, 'sequence_mapped'], + [SOURCE_ID_TREMBL_NON_DISPLAY, 'Uniprot/SPTREMBL', 10, 'protein_evidence_gt_2'], + [SOURCE_ID_SWISSPROT_DIRECT, 'Uniprot/SWISSPROT', 10, 'direct'], + [SOURCE_ID_TREMBL_DIRECT, 'Uniprot/SPTREMBL', 10, 'direct'], + [SOURCE_ID_ISOFORM, 'Uniprot_isoform', 10, ''], + [SOURCE_ID_PDB, 'PDB', 10, ''], + [SOURCE_ID_STRING, 'STRING', 10, ''], + [SOURCE_ID_EMBL, 'EMBL', 10, ''], + [SOURCE_ID_BIOGRID, 'BioGRID', 10, ''], + [SOURCE_ID_CHEMBL, 'ChEMBL', 10, ''], + [SOURCE_ID_UNIPROT_GN, 'Uniprot_gn', 10, ''], + [SOURCE_ID_PROTEIN_ID, 'protein_id', 10, ''], + ] + for row in source_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO source (source_id, name, ordered, priority_description) + VALUES (:source_id, :name, :ordered, :priority_description) + """ + ), + { + "source_id": row[0], + "name": row[1], + "ordered": row[2], + "priority_description": row[3], + } + ) + +# Function to run and validate the parsing process +def run_and_validate_parsing(uniprot_parser: UniProtParser, mock_xref_dbi: DBConnection, file:str, expected_xrefs: Dict[str, int], expected_deps: Dict[str, int], prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = uniprot_parser.run( + { + "source_id": SOURCE_ID_UNIPROT, + "species_id": SPECIES_ID_HUMAN, + "file": f"parsers/flatfiles/{file}.txt", + "rel_file": 
"parsers/flatfiles/uniprot_release.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + sp = expected_xrefs["num_sp"] + sptr = expected_xrefs["num_sptr"] + sptr_non_display = expected_xrefs["num_sptr_non_display"] + direct_sp = expected_xrefs["num_direct_sp"] + direct_sptr = expected_xrefs["num_direct_sptr"] + isoform = expected_xrefs["num_isoform"] + skipped = expected_xrefs["num_skipped"] + + assert result_code == 0, f"{prefix}Errors when parsing UniProt data" + assert ( + f"Read {sp} SwissProt xrefs, {sptr} SPTrEMBL xrefs with protein evidence codes 1-2," in result_message + ), f"{prefix}Expected 'Read {sp} SwissProt xrefs, {sptr} SPTrEMBL xrefs with protein evidence codes 1-2,' in result_message, but got: '{result_message}'" + assert ( + f"and {sptr_non_display} SPTrEMBL xrefs with protein evidence codes > 2 from" in result_message + ), f"{prefix}Expected 'and {sptr_non_display} SPTrEMBL xrefs with protein evidence codes > 2 from' in result_message, but got: '{result_message}'" + assert ( + f"Added {direct_sp} direct SwissProt xrefs and {direct_sptr} direct SPTrEMBL xrefs" in result_message + ), f"{prefix}Expected 'Added {direct_sp} direct SwissProt xrefs and {direct_sptr} direct SPTrEMBL xrefs' in result_message, but got: '{result_message}'" + assert ( + f"Added {isoform} direct isoform xrefs" in result_message + ), f"{prefix}Expected 'Added {isoform} direct isoform xrefs' in result_message, but got: '{result_message}'" + assert ( + f"Skipped {skipped} ensembl annotations as Gene names" in result_message + ), f"{prefix}Expected 'Skipped {skipped} ensembl annotations as Gene names' in result_message, but got: '{result_message}'" + + for count_type, count in expected_deps.items(): + assert f"{count_type}\t{count}" in result_message, f"{prefix}Expected '{count_type}\t{count}' in result_meesgae, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_uniprot_no_source_id(uniprot_parser: UniProtParser, test_no_source_id: Callable[[UniProtParser, int], None]) -> None: + test_no_source_id(uniprot_parser, SPECIES_ID_HUMAN) + +def test_uniprot_no_species_id(uniprot_parser: UniProtParser, test_no_species_id: Callable[[UniProtParser, int], None]) -> None: + test_no_species_id(uniprot_parser, SOURCE_ID_UNIPROT) + +def test_uniprot_no_file(uniprot_parser: UniProtParser, test_no_file: Callable[[UniProtParser, int, int], None]) -> None: + test_no_file(uniprot_parser, SOURCE_ID_UNIPROT, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the required source_id is missing +def test_uniprot_missing_required_source_id(uniprot_parser: UniProtParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[UniProtParser, DBConnection, str, int, int, str], None]) -> None: + test_missing_required_source_id(uniprot_parser, mock_xref_dbi, 'Uniprot/SWISSPROT', SOURCE_ID_SWISSPROT, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_uniprot_file_not_found(uniprot_parser: UniProtParser, test_file_not_found: Callable[[UniProtParser, int, int], None]) -> None: + test_file_not_found(uniprot_parser, SOURCE_ID_UNIPROT, SPECIES_ID_HUMAN) + +# Test case to check successful parsing of valid UniProt data +def test_successful_parsing(mock_xref_dbi: DBConnection, uniprot_parser: UniProtParser) -> None: + populate_xref_db(mock_xref_dbi) + + # Run and validate parsing for UniProt SWISSPROT file + expected_counts = {"num_sp": 4, "num_sptr": 0, "num_sptr_non_display": 0, 
"num_direct_sp": 8, "num_direct_sptr": 0, "num_isoform": 6, "num_skipped": 1} + expected_deps = {"PDB": 50, "STRING": 4, "EMBL": 34, "BioGRID": 4, "ChEMBL": 4, "protein_id": 34, "Uniprot_gn": 3} + run_and_validate_parsing(uniprot_parser, mock_xref_dbi, "uniprot_swissprot", expected_counts, expected_deps) + + # Run and validate parsing for UniProt TREMBL file + expected_counts = {"num_sp": 0, "num_sptr": 1, "num_sptr_non_display": 8, "num_direct_sp": 0, "num_direct_sptr": 0, "num_isoform": 0, "num_skipped": 0} + expected_deps = {"EMBL": 49, "protein_id": 49, "Uniprot_gn": 7} + run_and_validate_parsing(uniprot_parser, mock_xref_dbi, "uniprot_trembl", expected_counts, expected_deps) + + # Check the row counts in the xref, translation_direct_xref, dependent_xref, primary_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_SWISSPROT}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DIRECT' AND source_id={SOURCE_ID_SWISSPROT_DIRECT}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_TREMBL}") + check_row_count(mock_xref_dbi, "xref", 0, f"info_type='DIRECT' AND source_id={SOURCE_ID_TREMBL_DIRECT}") + check_row_count(mock_xref_dbi, "xref", 8, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_TREMBL_NON_DISPLAY}") + check_row_count(mock_xref_dbi, "xref", 49, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_PDB}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_STRING}") + check_row_count(mock_xref_dbi, "xref", 83, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_EMBL}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_BIOGRID}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_CHEMBL}") + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_UNIPROT_GN}") + check_row_count(mock_xref_dbi, "xref", 83, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_PROTEIN_ID}") + check_row_count(mock_xref_dbi, "translation_direct_xref", 14) + check_row_count(mock_xref_dbi, "dependent_xref", 238) + check_row_count(mock_xref_dbi, "primary_xref", 13) + check_row_count(mock_xref_dbi, "synonym", 16) + + # Check the link between an xref and translation_direct_xref + check_direct_xref_link(mock_xref_dbi, "translation", "P62258", "ENSP00000461762") + check_direct_xref_link(mock_xref_dbi, "translation", "P31946-1", "ENSP00000361930") + + # Check the link between an xref and dependent_xref + master_xref_id = mock_xref_dbi.execute(text(f"SELECT xref_id FROM xref WHERE accession='Q4F4R7' AND source_id={SOURCE_ID_TREMBL_NON_DISPLAY}")).scalar() + check_dependent_xref_link(mock_xref_dbi, "DQ305032", master_xref_id) + check_dependent_xref_link(mock_xref_dbi, "AGQ46203", master_xref_id) + master_xref_id = mock_xref_dbi.execute(text(f"SELECT xref_id FROM xref WHERE accession='P62258' AND source_id={SOURCE_ID_SWISSPROT}")).scalar() + check_dependent_xref_link(mock_xref_dbi, "6EIH", master_xref_id) + + # Check the sequences for specific accessions + check_sequence(mock_xref_dbi, "Q04917", SOURCE_ID_SWISSPROT, "MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSWEAGEGN") + check_sequence(mock_xref_dbi, "A0A7D5YZ42", SOURCE_ID_TREMBL_NON_DISPLAY, "LSKVYGPVFTLYFGLKPIVVLHGYEAVKEALIDLGEEFSGRGIFPLAERANRGFGIVFSNGKKWKEIRHFSLMTLRNFGMGKRSIEDRVQEEARCLVEELRKTKGG") + + # Check the synonyms for specific accessions + 
check_synonym(mock_xref_dbi, "P62258", SOURCE_ID_UNIPROT_GN, "YWHAE1") + check_synonym(mock_xref_dbi, "P61981", SOURCE_ID_SWISSPROT, "P35214") + check_synonym(mock_xref_dbi, "P61981", SOURCE_ID_SWISSPROT, "Q9UDP2") + + # Check the release info + check_release(mock_xref_dbi, SOURCE_ID_SWISSPROT, "UniProtKB/Swiss-Prot Release 2024_03 of 29-May-2024") + check_release(mock_xref_dbi, SOURCE_ID_TREMBL, "UniProtKB/TrEMBL Release 2024_03 of 29-May-2024") \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_vgnc_parser.py b/src/python/test/xrefs/parsers/test_vgnc_parser.py new file mode 100644 index 000000000..6ebe58d8d --- /dev/null +++ b/src/python/test/xrefs/parsers/test_vgnc_parser.py @@ -0,0 +1,96 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.VGNCParser import VGNCParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link, check_synonym + +# Constants +SOURCE_ID_VGNC = 1 +SPECIES_ID_P_TROGLODYTES = 9598 + +# Fixture to create a VGNCParser instance +@pytest.fixture +def vgnc_parser() -> VGNCParser: + return VGNCParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(vgnc_parser: VGNCParser, mock_xref_dbi: DBConnection, expected_xrefs: int, expected_synonyms: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = vgnc_parser.run( + { + "source_id": SOURCE_ID_VGNC, + "species_id": SPECIES_ID_P_TROGLODYTES, + "file": "parsers/flatfiles/vgnc.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing VGNC data" + assert ( + f"Loaded a total of {expected_xrefs} VGNC xrefs and added {expected_synonyms} synonyms" in result_message + ), f"{prefix}Expected 'Loaded a total of {expected_xrefs} VGNC xrefs and added {expected_synonyms} synonyms' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_vgnc_no_source_id(vgnc_parser: VGNCParser, test_no_source_id: Callable[[VGNCParser, int], None]) -> None: + test_no_source_id(vgnc_parser, SPECIES_ID_P_TROGLODYTES) + +def test_vgnc_no_species_id(vgnc_parser: VGNCParser, test_no_species_id: Callable[[VGNCParser, int], None]) -> None: + test_no_species_id(vgnc_parser, SOURCE_ID_VGNC) + +def test_vgnc_no_file(vgnc_parser: VGNCParser, test_no_file: Callable[[VGNCParser, int, int], None]) -> None: + test_no_file(vgnc_parser, SOURCE_ID_VGNC, SPECIES_ID_P_TROGLODYTES) + +# Test case to check if an error is raised when the file is not found +def test_vgnc_file_not_found(vgnc_parser: VGNCParser, test_file_not_found: Callable[[VGNCParser, int, int], None]) -> None: + test_file_not_found(vgnc_parser, SOURCE_ID_VGNC, SPECIES_ID_P_TROGLODYTES) + +# Test case to check if an error is raised when the file is empty +def test_vgnc_empty_file(vgnc_parser: VGNCParser, test_empty_file: Callable[[VGNCParser, str, int, int], None]) -> None: + test_empty_file(vgnc_parser, 'VGNC', SOURCE_ID_VGNC, SPECIES_ID_P_TROGLODYTES) + +# Test case to check if an error is raised when required columns are missing +def test_missing_columns(vgnc_parser: VGNCParser, mock_xref_dbi: DBConnection) -> None: + mock_file = 
io.StringIO("taxon_id\tvgnc_id\tsymbol\tname\tlocus_group\tlocus_type\tstatus\tlocation\tlocation_sortable:\talias_symbol\talias_name\tprev_symbol\tprev_name\tgene_family\tgene_family_id\tdate_approved_reserved\tdate_symbol_changed\tdate_name_changed\tdate_modified\tentrez_id\tuniprot_ids\n") + vgnc_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Can't find required columns in VGNC file"): + vgnc_parser.run( + { + "source_id": SOURCE_ID_VGNC, + "species_id": SPECIES_ID_P_TROGLODYTES, + "file": "dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + +# Test case to check successful parsing of valid VGNC data +def test_successful_parsing(mock_xref_dbi: DBConnection, vgnc_parser: VGNCParser) -> None: + vgnc_parser.species_id_to_taxonomy = MagicMock(return_value={}) + + # Run and validate parsing for VGNC file + run_and_validate_parsing(vgnc_parser, mock_xref_dbi, 6, 2) + + # Check the row counts in the xref, gene_direct_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_VGNC}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 6) + check_row_count(mock_xref_dbi, "synonym", 2) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "VGNC:14660", "ENSPTRG00000013870") + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "VGNC:14659", SOURCE_ID_VGNC, "TEST_SYNONYM") + check_synonym(mock_xref_dbi, "VGNC:3738", SOURCE_ID_VGNC, "DIP2") + + # Run and validate re-parsing for VGNC file + run_and_validate_parsing(vgnc_parser, mock_xref_dbi, 6, 2) + + # Check the row counts in the xref, gene_direct_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_VGNC}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 6) + check_row_count(mock_xref_dbi, "synonym", 2) diff --git a/src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py b/src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py new file mode 100644 index 000000000..dda0c7bdc --- /dev/null +++ b/src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py @@ -0,0 +1,78 @@ +import pytest +from typing import Callable + +from ensembl.production.xrefs.parsers.XenopusJamboreeParser import XenopusJamboreeParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link, check_description + +# Constants +SOURCE_ID_XENOPUS_JAMBOREE = 1 +SPECIES_ID_XENOPUS = 8364 + +# Fixture to create a XenopusJamboreeParser instance +@pytest.fixture +def xenopus_jamboree_parser() -> XenopusJamboreeParser: + return XenopusJamboreeParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(xenopus_jamboree_parser: XenopusJamboreeParser, mock_xref_dbi: DBConnection, expected_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = xenopus_jamboree_parser.run( + { + "source_id": SOURCE_ID_XENOPUS_JAMBOREE, + "species_id": SPECIES_ID_XENOPUS, + "file": "parsers/flatfiles/xenopus_jamboree.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing Xenopus Jamboree data" + assert ( + f"{expected_xrefs} XenopusJamboree xrefs successfully parsed" in result_message + ), f"{prefix}Expected '{expected_xrefs} XenopusJamboree xrefs successfully parsed' in result_message, but got: '{result_message}'" + +# Test cases to check if 
mandatory parser arguments are passed: source_id, species_id, and file +def test_xenopus_jamboree_no_source_id(xenopus_jamboree_parser: XenopusJamboreeParser, test_no_source_id: Callable[[XenopusJamboreeParser, int], None]) -> None: + test_no_source_id(xenopus_jamboree_parser, SPECIES_ID_XENOPUS) + +def test_xenopus_jamboree_no_species_id(xenopus_jamboree_parser: XenopusJamboreeParser, test_no_species_id: Callable[[XenopusJamboreeParser, int], None]) -> None: + test_no_species_id(xenopus_jamboree_parser, SOURCE_ID_XENOPUS_JAMBOREE) + +def test_xenopus_jamboree_no_file(xenopus_jamboree_parser: XenopusJamboreeParser, test_no_file: Callable[[XenopusJamboreeParser, int, int], None]) -> None: + test_no_file(xenopus_jamboree_parser, SOURCE_ID_XENOPUS_JAMBOREE, SPECIES_ID_XENOPUS) + +# Test case to check if an error is raised when the file is not found +def test_xenopus_jamboree_file_not_found(xenopus_jamboree_parser: XenopusJamboreeParser, test_file_not_found: Callable[[XenopusJamboreeParser, int, int], None]) -> None: + test_file_not_found(xenopus_jamboree_parser, SOURCE_ID_XENOPUS_JAMBOREE, SPECIES_ID_XENOPUS) + +# Test case to check if an error is raised when the file is empty +def test_xenopus_jamboree_empty_file(xenopus_jamboree_parser: XenopusJamboreeParser, test_empty_file: Callable[[XenopusJamboreeParser, str, int, int], None]) -> None: + test_empty_file(xenopus_jamboree_parser, 'XenopusJamboree', SOURCE_ID_XENOPUS_JAMBOREE, SPECIES_ID_XENOPUS) + +# Test case to check successful parsing of valid Xenopus Jamboree data +def test_successful_parsing(mock_xref_dbi: DBConnection, xenopus_jamboree_parser: XenopusJamboreeParser) -> None: + # Run and validate parsing for Xenopus Jamboree file + run_and_validate_parsing(xenopus_jamboree_parser, mock_xref_dbi, 12) + + # Check the row counts in the xref and gene_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 12, f"info_type='DIRECT' AND source_id={SOURCE_ID_XENOPUS_JAMBOREE}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 12) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "XB-GENE-478064", "ENSXETG00000005286") + check_direct_xref_link(mock_xref_dbi, "gene", "XB-GENE-478141", "ENSXETG00000025664") + + # Check if provenance information correctly removed from descriptions + check_description(mock_xref_dbi, "XB-GENE-940866", "receptor (chemosensory) transporter protein 3 gene C") + + # Check if "X of Y" labels correctly removed from descriptions + check_description(mock_xref_dbi, "XB-GENE-981482", "conserved hypothetical olfactory receptor") + + # Run and validate re-parsing for Xenopus Jamboree file + run_and_validate_parsing(xenopus_jamboree_parser, mock_xref_dbi, 12, "Re-parsing: ") + + # Check the row counts in the xref and gene_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 12, f"info_type='DIRECT' AND source_id={SOURCE_ID_XENOPUS_JAMBOREE}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 12) \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_zfin_desc_parser.py b/src/python/test/xrefs/parsers/test_zfin_desc_parser.py new file mode 100644 index 000000000..1ef373c46 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_zfin_desc_parser.py @@ -0,0 +1,63 @@ +import pytest +from typing import Callable + +from ensembl.production.xrefs.parsers.ZFINDescParser import ZFINDescParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count + +# Constants +SOURCE_ID_ZFIN = 1 +SPECIES_ID_ZEBRAFISH = 7955 
+ +# Fixture to create a ZFINDescParser instance +@pytest.fixture +def zfin_desc_parser() -> ZFINDescParser: + return ZFINDescParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(zfin_desc_parser: ZFINDescParser, mock_xref_dbi: DBConnection, expected_xrefs: int, expected_withdrawn: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = zfin_desc_parser.run( + { + "source_id": SOURCE_ID_ZFIN, + "species_id": SPECIES_ID_ZEBRAFISH, + "file": "parsers/flatfiles/zfin_desc.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing ZFINDesc data" + assert ( + f"{expected_xrefs} ZFINDesc xrefs added" in result_message + ), f"{prefix}Expected '{expected_xrefs} ZFINDesc xrefs added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_withdrawn} withdrawn entries ignored" in result_message + ), f"{prefix}Expected '{expected_withdrawn} withdrawn entries ignored' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_zfin_desc_no_source_id(zfin_desc_parser: ZFINDescParser, test_no_source_id: Callable[[ZFINDescParser, int], None]) -> None: + test_no_source_id(zfin_desc_parser, SPECIES_ID_ZEBRAFISH) + +def test_zfin_desc_no_species_id(zfin_desc_parser: ZFINDescParser, test_no_species_id: Callable[[ZFINDescParser, int], None]) -> None: + test_no_species_id(zfin_desc_parser, SOURCE_ID_ZFIN) + +def test_zfin_desc_no_file(zfin_desc_parser: ZFINDescParser, test_no_file: Callable[[ZFINDescParser, int, int], None]) -> None: + test_no_file(zfin_desc_parser, SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + +# Test case to check if an error is raised when the file is not found +def test_zfin_desc_file_not_found(zfin_desc_parser: ZFINDescParser, test_file_not_found: Callable[[ZFINDescParser, int, int], None]) -> None: + test_file_not_found(zfin_desc_parser, SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + +# Test case to check if an error is raised when the file is empty +def test_zfin_desc_empty_file(zfin_desc_parser: ZFINDescParser, test_empty_file: Callable[[ZFINDescParser, str, int, int], None]) -> None: + test_empty_file(zfin_desc_parser, 'ZFINDesc', SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + +# Test case to check successful parsing of valid ZFINDesc data +def test_successful_parsing(mock_xref_dbi: DBConnection, zfin_desc_parser: ZFINDescParser) -> None: + # Run and validate parsing for ZFINDesc file + run_and_validate_parsing(zfin_desc_parser, mock_xref_dbi, 6, 3) + + # Check the row counts in the xref table + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='MISC' AND source_id={SOURCE_ID_ZFIN}") \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_zfin_parser.py b/src/python/test/xrefs/parsers/test_zfin_parser.py new file mode 100644 index 000000000..060ffa2bc --- /dev/null +++ b/src/python/test/xrefs/parsers/test_zfin_parser.py @@ -0,0 +1,165 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable +from sqlalchemy import text + +from ensembl.production.xrefs.parsers.ZFINParser import ZFINParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link, check_dependent_xref_link, check_synonym, check_description + +# Constants +SOURCE_ID_ZFIN = 1 +SOURCE_ID_DIRECT = 2 +SOURCE_ID_DEPENDENT = 3 +SOURCE_ID_DESCRIPTION = 4 +SOURCE_ID_UNIPROT = 5 +SOURCE_ID_REFSEQ = 6 
+SPECIES_ID_ZEBRAFISH = 7955 + +# Fixture to create a ZFINParser instance +@pytest.fixture +def zfin_parser() -> ZFINParser: + return ZFINParser(True) + +# Function to populate the database with ZFIN Desc, Uniprot, and RefSeq xrefs +def populate_xref_db(mock_xref_dbi: DBConnection): + source_data = [ + [SOURCE_ID_DESCRIPTION, 'ZFIN_ID', 10, 'description_only'], + [SOURCE_ID_DIRECT, 'ZFIN_ID', 1, 'direct'], + [SOURCE_ID_DEPENDENT, 'ZFIN_ID', 2, 'uniprot/refseq'], + [SOURCE_ID_UNIPROT, 'Uniprot/SWISSPROT', 20, ''], + [SOURCE_ID_REFSEQ, 'RefSeq_dna', 15, ''], + ] + for row in source_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO source (source_id, name, ordered, priority_description) + VALUES (:source_id, :name, :ordered, :priority_description) + """ + ), + { + "source_id": row[0], + "name": row[1], + "ordered": row[2], + "priority_description": row[3] + } + ) + + xref_data = [ + [1, 'ZDB-GENE-000125-4', SOURCE_ID_DESCRIPTION, SPECIES_ID_ZEBRAFISH, 'MISC', 'deltaC'], + [2, 'ZDB-GENE-000201-9', SOURCE_ID_DESCRIPTION, SPECIES_ID_ZEBRAFISH, 'MISC', 'anosmin 1a'], + [3, 'ZDB-GENE-000128-18', SOURCE_ID_DESCRIPTION, SPECIES_ID_ZEBRAFISH, 'MISC', 'anoctamin 1'], + [4, 'A0A8M9PP76', SOURCE_ID_UNIPROT, SPECIES_ID_ZEBRAFISH, 'SEQUENCE_MATCH', ''], + [5, 'B2GNV2', SOURCE_ID_UNIPROT, SPECIES_ID_ZEBRAFISH, 'SEQUENCE_MATCH', ''], + [6, 'Q9PTU1', SOURCE_ID_UNIPROT, SPECIES_ID_ZEBRAFISH, 'SEQUENCE_MATCH', ''], + [7, 'NP_571533', SOURCE_ID_REFSEQ, SPECIES_ID_ZEBRAFISH, 'SEQUENCE_MATCH', ''], + [8, 'NM_131458', SOURCE_ID_REFSEQ, SPECIES_ID_ZEBRAFISH, 'SEQUENCE_MATCH', ''], + ] + for row in xref_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO xref (xref_id, accession, source_id, species_id, info_type, description) + VALUES (:xref_id, :accession, :source_id, :species_id, :info_type, :description) + """ + ), + { + "xref_id": row[0], + "accession": row[1], + "source_id": row[2], + "species_id": row[3], + "info_type": row[4], + "description": row[5] + } + ) + + mock_xref_dbi.commit() + +# Function to run and validate the parsing process +def run_and_validate_parsing(zfin_parser: ZFINParser, mock_xref_dbi: DBConnection, expected_direct_xrefs: int, expected_uniprot_xrefs: int, expected_refseq_xref: int, expected_mismatch: int, expected_synonyms: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = zfin_parser.run( + { + "source_id": SOURCE_ID_ZFIN, + "species_id": SPECIES_ID_ZEBRAFISH, + "file": "parsers/flatfiles/zfin/dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing ZFIN data" + assert ( + f"{expected_direct_xrefs} direct ZFIN xrefs added and" in result_message + ), f"{prefix}Expected '{expected_direct_xrefs} direct ZFIN xrefs added and' in result_message, but got: '{result_message}'" + assert ( + f"{expected_uniprot_xrefs} dependent xrefs from UniProt added" in result_message + ), f"{prefix}Expected '{expected_uniprot_xrefs} dependent xrefs from UniProt added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_refseq_xref} dependent xrefs from RefSeq added" in result_message + ), f"{prefix}Expected '{expected_refseq_xref} dependent xrefs from RefSeq added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_mismatch} dependents ignored" in result_message + ), f"{prefix}Expected '{expected_mismatch} dependents ignored' in result_message, but got: '{result_message}'" + assert ( + f"{expected_synonyms} synonyms loaded" in result_message + ), 
f"{prefix}Expected '{expected_synonyms} synonyms loaded' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_zfin_no_source_id(zfin_parser: ZFINParser, test_no_source_id: Callable[[ZFINParser, int], None]) -> None: + test_no_source_id(zfin_parser, SPECIES_ID_ZEBRAFISH) + +def test_zfin_no_species_id(zfin_parser: ZFINParser, test_no_species_id: Callable[[ZFINParser, int], None]) -> None: + test_no_species_id(zfin_parser, SOURCE_ID_ZFIN) + +def test_zfin_no_file(zfin_parser: ZFINParser, test_no_file: Callable[[ZFINParser, int, int], None]) -> None: + test_no_file(zfin_parser, SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + +# Test case to check if an error is raised when the required source_id is missing +def test_zfin_missing_required_source_id(zfin_parser: ZFINParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[ZFINParser, DBConnection, str, int, int, str], None]) -> None: + test_missing_required_source_id(zfin_parser, mock_xref_dbi, 'ZFIN_ID', SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH, 'direct') + +# Test case to check if an error is raised when the file is not found +def test_zfin_file_not_found(zfin_parser: ZFINParser, test_file_not_found: Callable[[ZFINParser, int, int], None]) -> None: + test_file_not_found(zfin_parser, SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + +# Test case to check if an error is raised when the file is empty +def test_dbass_empty_file(zfin_parser: ZFINParser, test_empty_file: Callable[[ZFINParser, str, int, int], None]) -> None: + test_empty_file(zfin_parser, 'ZFIN Ensembl', SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + +# Test case to check successful parsing +def test_successful_parsing(mock_xref_dbi: DBConnection, zfin_parser: ZFINParser) -> None: + populate_xref_db(mock_xref_dbi) + + # Check the row counts in the xref before running the parser + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='MISC' AND source_id={SOURCE_ID_DESCRIPTION}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_UNIPROT}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_REFSEQ}") + + # Run and validate parsing for ZFIN files + run_and_validate_parsing(zfin_parser, mock_xref_dbi, 10, 3, 2, 9, 5) + + # Check the row counts in the xref, dependent_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DIRECT' AND source_id={SOURCE_ID_DIRECT}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_DEPENDENT}") + check_row_count(mock_xref_dbi, "dependent_xref", 5) + check_row_count(mock_xref_dbi, "synonym", 7) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "ZDB-GENE-000125-4", "ENSDARG00000002336") + + # Check the link between an xref and dependent_xref + check_dependent_xref_link(mock_xref_dbi, "ZDB-GENE-000128-18", 5) + check_dependent_xref_link(mock_xref_dbi, "ZDB-GENE-000128-18", 6) + check_dependent_xref_link(mock_xref_dbi, "ZDB-GENE-000201-96", 7) + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "ZDB-GENE-000125-12", SOURCE_ID_DIRECT, "Df(LG03)") + check_synonym(mock_xref_dbi, "ZDB-GENE-000128-18", SOURCE_ID_DEPENDENT, "Tg(NBT:MAPT-GFP)") + + # Check the descriptions for specific accessions + check_description(mock_xref_dbi, "ZDB-GENE-000125-4", "deltaC") + check_description(mock_xref_dbi, "ZDB-GENE-000201-9", "anosmin 
1a") + check_description(mock_xref_dbi, "ZDB-GENE-000128-18", "anoctamin 1") diff --git a/src/python/test/xrefs/pytest.ini b/src/python/test/xrefs/pytest.ini new file mode 100644 index 000000000..b79469489 --- /dev/null +++ b/src/python/test/xrefs/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = --verbose --tb=line \ No newline at end of file diff --git a/src/python/test/xrefs/test_helpers.py b/src/python/test/xrefs/test_helpers.py new file mode 100644 index 000000000..efe1e35d4 --- /dev/null +++ b/src/python/test/xrefs/test_helpers.py @@ -0,0 +1,80 @@ +from sqlalchemy import text + +from ensembl.utils.database import UnitTestDB, DBConnection + +# Helper function to check the row count in a specific table +def check_row_count(db: DBConnection, table: str, expected_count: int, where_clause: str = None) -> None: + sql = f"SELECT COUNT(*) FROM {table}" + if where_clause is not None: + sql += f" WHERE {where_clause}" + + row_count = db.execute(text(sql)).scalar() + assert ( + row_count == expected_count + ), f"Expected {expected_count} rows in {table} table (WHERE: {where_clause or ''}), but got {row_count}" + +# Helper function to check the synonym for a specific accession +def check_synonym(db: DBConnection, accession: str, source_id: int, expected_synonym: str) -> None: + synonym = db.execute( + text( + f"SELECT s.synonym FROM synonym s, xref x WHERE s.xref_id=x.xref_id AND x.accession='{accession}' AND x.source_id={source_id} AND s.synonym='{expected_synonym}'" + ) + ).scalar() + assert ( + synonym == expected_synonym + ), f"Expected synonym '{expected_synonym}' for accession '{accession}', but got '{synonym}'" + +# Helper function to check the direct xref connection for a specific accession +def check_direct_xref_link(db: DBConnection, type: str, accession: str, expected_stable_id: str) -> None: + stable_id = db.execute( + text( + f"SELECT d.ensembl_stable_id FROM {type}_direct_xref d, xref x WHERE d.general_xref_id=x.xref_id AND x.accession='{accession}' AND d.ensembl_stable_id='{expected_stable_id}'" + ) + ).scalar() + assert ( + stable_id == expected_stable_id + ), f"Expected link between accession '{accession}' and EnsEMBL stable ID '{expected_stable_id}', but got '{stable_id}'" + +# Helper function to check the dependent xref connection for a specific accession +def check_dependent_xref_link(db: DBConnection, accession: str, expected_master_xref_id: str) -> None: + master_xref_id = db.execute( + text( + f"SELECT d.master_xref_id FROM dependent_xref d, xref x WHERE d.dependent_xref_id=x.xref_id AND x.accession='{accession}' AND d.master_xref_id={expected_master_xref_id}" + ) + ).scalar() + assert ( + master_xref_id == expected_master_xref_id + ), f"Expected link between accession '{accession}' and master xref ID '{expected_master_xref_id}', but got '{master_xref_id}'" + +# Helper function to check the sequence for a specific accession +def check_sequence(db: DBConnection, accession: str, source_id: int, expected_sequence: str) -> None: + sequence = db.execute( + text( + f"SELECT p.sequence FROM primary_xref p, xref x WHERE p.xref_id=x.xref_id AND x.accession='{accession}' AND x.source_id={source_id}" + ) + ).scalar() + assert ( + sequence == expected_sequence + ), f"Expected sequence '{expected_sequence}' for accession '{accession}', but got '{sequence}'" + +# Helper function to check the description for a specific accession +def check_description(db: DBConnection, accession: str, expected_description: str) -> None: + description = db.execute( + text( + f"SELECT description FROM xref 
WHERE accession='{accession}'" + ) + ).scalar() + assert ( + description == expected_description + ), f"Expected description '{expected_description}' for accession '{accession}', but got '{description}'" + +# Helper function to check the release info for a specific source_id +def check_release(db: DBConnection, source_id: str, expected_release: str) -> None: + release = db.execute( + text( + f"SELECT source_release FROM source WHERE source_id={source_id}" + ) + ).scalar() + assert ( + release == expected_release + ), f"Expected release info '{expected_release}' for source_id {source_id}, but got '{release}'" \ No newline at end of file From 182f2513350478f81b6b019185ee5df23d5f0cc1 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Fri, 8 Nov 2024 13:26:39 +0000 Subject: [PATCH 03/12] New tests and fixes --- src/python/test/xrefs/conftest.py | 150 +++++--- .../xrefs/flatfiles/RNACentral-md5.tsv.gz | Bin 0 -> 475 bytes .../test/xrefs/flatfiles/UniParc-upidump.lis | 10 + src/python/test/xrefs/flatfiles/config.ini | 88 +++++ src/python/test/xrefs/flatfiles/peptides.fa | 200 ++++++++++ src/python/test/xrefs/flatfiles/sources.json | 16 + .../xrefs/parsers/test_arrayexpress_parser.py | 8 +- .../test/xrefs/parsers/test_ccds_parser.py | 8 +- .../test/xrefs/parsers/test_dbass_parser.py | 12 +- .../xrefs/parsers/test_entrezgene_parser.py | 16 +- .../test/xrefs/parsers/test_hgnc_parser.py | 19 +- .../test/xrefs/parsers/test_hpa_parser.py | 12 +- .../xrefs/parsers/test_jgi_protein_parser.py | 12 +- .../xrefs/parsers/test_mgi_desc_parser.py | 12 +- .../test/xrefs/parsers/test_mgi_parser.py | 12 +- .../xrefs/parsers/test_mim2gene_parser.py | 12 +- .../test/xrefs/parsers/test_mim_parser.py | 16 +- .../test/xrefs/parsers/test_mirbase_parser.py | 12 +- .../xrefs/parsers/test_reactome_parser.py | 27 +- .../test/xrefs/parsers/test_refseq_parser.py | 14 +- .../test/xrefs/parsers/test_rfam_parser.py | 12 +- .../test/xrefs/parsers/test_rgd_parser.py | 14 +- .../test/xrefs/parsers/test_ucsc_parser.py | 12 +- .../test/xrefs/parsers/test_uniprot_parser.py | 14 +- .../test/xrefs/parsers/test_vgnc_parser.py | 12 +- .../parsers/test_xenopus_jamboree_parser.py | 12 +- .../xrefs/parsers/test_zfin_desc_parser.py | 22 +- .../test/xrefs/parsers/test_zfin_parser.py | 12 +- src/python/test/xrefs/pytest.ini | 3 + src/python/test/xrefs/test_checksum.py | 104 +++++ src/python/test/xrefs/test_download_source.py | 118 ++++++ src/python/test/xrefs/test_helpers.py | 25 +- .../test/xrefs/test_schedule_alignment.py | 106 ++++++ .../test/xrefs/test_schedule_cleanup.py | 116 ++++++ .../test/xrefs/test_schedule_download.py | 116 ++++++ src/python/test/xrefs/test_schedule_parse.py | 224 +++++++++++ .../test/xrefs/test_schedule_species.py | 358 ++++++++++++++++++ 37 files changed, 1694 insertions(+), 242 deletions(-) create mode 100644 src/python/test/xrefs/flatfiles/RNACentral-md5.tsv.gz create mode 100644 src/python/test/xrefs/flatfiles/UniParc-upidump.lis create mode 100644 src/python/test/xrefs/flatfiles/config.ini create mode 100644 src/python/test/xrefs/flatfiles/peptides.fa create mode 100644 src/python/test/xrefs/flatfiles/sources.json create mode 100644 src/python/test/xrefs/test_checksum.py create mode 100644 src/python/test/xrefs/test_download_source.py create mode 100644 src/python/test/xrefs/test_schedule_alignment.py create mode 100644 src/python/test/xrefs/test_schedule_cleanup.py create mode 100644 src/python/test/xrefs/test_schedule_download.py create mode 100644 src/python/test/xrefs/test_schedule_parse.py create 
mode 100644 src/python/test/xrefs/test_schedule_species.py diff --git a/src/python/test/xrefs/conftest.py b/src/python/test/xrefs/conftest.py index 36b690013..e1067414e 100644 --- a/src/python/test/xrefs/conftest.py +++ b/src/python/test/xrefs/conftest.py @@ -1,83 +1,103 @@ import pytest -import os import io import re +import os +import importlib from datetime import datetime from unittest.mock import MagicMock -from typing import Any, Generator, Callable +from typing import Any, Generator, Callable, Dict from ensembl.utils.database import UnitTestDB, DBConnection -from ensembl.xrefs.xref_update_db_model import Base +from ensembl.xrefs.xref_update_db_model import Base as BaseUpdateORM +from ensembl.xrefs.xref_source_db_model import Base as BaseSourceORM from ensembl.production.xrefs.parsers.BaseParser import BaseParser -# Fixture to set up a test database +# Adding custom command-line options to pytest +def pytest_addoption(parser): + parser.addoption( + "--test_db_url", + action="store", + default=os.getenv("TEST_DB_URL"), + help="MySQL URL to use for the test databases", + ) + parser.addoption( + "--test_scratch_path", + action="store", + default=os.getenv("TEST_SCRATCH_PATH"), + help="Path to a scratch directory to use for temporary files", + ) + +# Fixture to set up a xref test database @pytest.fixture(scope="module") -def test_db() -> Generator[None, None, None]: - # Create a unique database name using the current user and timestamp - user = os.environ.get("USER", "testuser") +def test_xref_db(pytestconfig: pytest.Config) -> Generator[UnitTestDB, None, None]: + # Retrieve the test DB URL + test_db_url = pytestconfig.getoption("test_db_url") + if not test_db_url: + raise ValueError(f"DB URL for test database must be provided") + + # Create a unique database name using the timestamp timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - db_name = f"{user}_test_xref_{timestamp}" - mysql_url = f"mysql+pymysql://ensadmin:ensembl@mysql-ens-core-prod-1.ebi.ac.uk:4524/{db_name}" + db_name = f"test_xref_update_{timestamp}" + full_test_db_url = f"{test_db_url}/{db_name}" # Create all tables defined in the Base metadata - with UnitTestDB(mysql_url, metadata=Base.metadata, name=db_name) as test_db: + with UnitTestDB(full_test_db_url, metadata=BaseUpdateORM.metadata, name=db_name) as test_db: yield test_db -# Fixture to connect to the test database and close connection when done +# Fixture to connect to the xref test database and close connection when done @pytest.fixture -def mock_xref_dbi(test_db: UnitTestDB) -> Generator[Any, None, None]: - conn = test_db.dbc.connect() +def mock_xref_dbi(test_xref_db) -> Generator[Any, None, None]: + conn = test_xref_db.dbc.connect() yield conn conn.close() -# Common test for missing source_id -@pytest.fixture -def test_no_source_id() -> Callable[[BaseParser, int], None]: - def _test_no_source_id(parser_instance: BaseParser, species_id: int = 9606) -> None: - with pytest.raises( - AttributeError, match=r"Missing required arguments: source_id(,| and) species_id(, and file)?" 
- ): - parser_instance.run( - { - "species_id": species_id, - "file": "dummy_file.txt", - "xref_dbi": MagicMock(), - } - ) - return _test_no_source_id +# Fixture to set up a source test database +@pytest.fixture(scope="module") +def test_source_db(pytestconfig: pytest.Config) -> Generator[UnitTestDB, None, None]: + # Retrieve the test DB URL + test_db_url = pytestconfig.getoption("test_db_url") + if not test_db_url: + raise ValueError(f"DB URL for test database must be provided") + + # Create a unique database name using the timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + db_name = f"test_xref_source_{timestamp}" + full_test_db_url = f"{test_db_url}/{db_name}" + + # Create all tables defined in the Base metadata + with UnitTestDB(full_test_db_url, metadata=BaseSourceORM.metadata, name=db_name) as test_db: + yield test_db -# Common test for missing species_id +# Fixture to connect to the source test database and close connection when done @pytest.fixture -def test_no_species_id() -> Callable[[BaseParser, int], None]: - def _test_no_species_id(parser_instance: BaseParser, source_id: int = 1) -> None: - with pytest.raises( - AttributeError, match=r"Missing required arguments: source_id(,| and) species_id(, and file)?" - ): - parser_instance.run( - { - "source_id": source_id, - "file": "dummy_file.txt", - "xref_dbi": MagicMock(), - } - ) - return _test_no_species_id +def mock_source_dbi(test_source_db) -> Generator[Any, None, None]: + conn = test_source_db.dbc.connect() + yield conn + conn.close() + +# @pytest.fixture +# def mock_source_db_url(test_source_db): +# return test_source_db.dbc.url -# Common test for missing file +# Common test for missing argument @pytest.fixture -def test_no_file() -> Callable[[BaseParser, int, int], None]: - def _test_no_file(parser_instance: BaseParser, source_id: int = 1, species_id: int = 9606) -> None: +def test_parser_missing_argument() -> Callable[[BaseParser, str, int, int], None]: + def _test_parser_missing_argument(parser_instance: BaseParser, arg_name: str, source_id: int = 1, species_id: int = 9606) -> None: + parser_args = { + "source_id": source_id, + "species_id": species_id, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + if arg_name in parser_args: + del parser_args[arg_name] + with pytest.raises( - AttributeError, match="Missing required arguments: source_id, species_id, and file" + AttributeError, match=r"Missing required arguments: source_id(,| and) species_id(, and file)?" 
): - parser_instance.run( - { - "source_id": source_id, - "species_id": species_id, - "xref_dbi": MagicMock(), - } - ) - return _test_no_file + parser_instance.run(parser_args) + return _test_parser_missing_argument # Common test for file not found @pytest.fixture @@ -132,4 +152,24 @@ def _test_missing_required_source_id(parser_instance: BaseParser, mock_dbi: DBCo "xref_dbi": mock_dbi, } ) - return _test_missing_required_source_id \ No newline at end of file + return _test_missing_required_source_id + +# Common test for missing required parameter +@pytest.fixture +def test_missing_required_param() -> Callable[[str, Dict[str, Any], str], None]: + def _test_missing_required_param(module_name: str, args: Dict[str, Any], param_name: str) -> None: + # Remove the param name being tested from the args + current_args = args.copy() + if param_name in current_args: + del current_args[param_name] + + # Import the module and create an instance + module = importlib.import_module(f"ensembl.production.xrefs.{module_name}") + module_class = getattr(module, module_name) + module_object = module_class(current_args, True, True) + + with pytest.raises( + AttributeError, match=f"Parameter '{param_name}' is required but has no value" + ): + module_object.run() + return _test_missing_required_param \ No newline at end of file diff --git a/src/python/test/xrefs/flatfiles/RNACentral-md5.tsv.gz b/src/python/test/xrefs/flatfiles/RNACentral-md5.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..1057790692375462aa40c6527c4357bcf2c19b90 GIT binary patch literal 475 zcmV<10VMt(iwFpfg(GJG15!>wLuGDsa$#&OZDchrbaQq9Rg>G1)-Vi4->G6t(9M$k z2rLu;vjteP)qe?y;SaOUB;l#xuA9dk?}d0HrJPr zPXIN5b*SKQ@}jNVz*kN`Gb(V~@(bVwECO-EFsf=bAtB3SA_HO#l?#vtfFaG}K*=OG zia7}9Mxz6%<5n0CFh_8V2p1X}P-k%>oWtwRpbroVQk0t<&|HJ8pMolFUx2+@LR;-0 zEEb)al(2ZhS}MEr0{o@m=ngA4R$HOrE@!eVn@sZp=1YOlch^GHFvgHA4iH@cY`cK3 zPpzdxb*!kR6ly%_NbP|>#|34C=R`mCK?CC|j$8rJRA4_~Pv1aYwjv>w} zcse|ux)t7EmPnP-ZO78-G*K~Eu}J8S3;6M}w7O6jT2ukCRr@4&kGyl=wf=mTZ>@sS zHEs~zE-AV@Nu4(M89;b4bp38N0%vKjwkxfWnizcT65{_1d2$#_hGc4_ba8`W0$sa= z^ki`3G6Fgy7jg003C4-VXo( literal 0 HcmV?d00001 diff --git a/src/python/test/xrefs/flatfiles/UniParc-upidump.lis b/src/python/test/xrefs/flatfiles/UniParc-upidump.lis new file mode 100644 index 000000000..eb5e9b96e --- /dev/null +++ b/src/python/test/xrefs/flatfiles/UniParc-upidump.lis @@ -0,0 +1,10 @@ +UPI00018273A1 0346D0CAE142F3B4BEAB03C043F946C2 +UPI00159B34E5 3F21C95F3E901F0CFF99DD2B7AB9E0FB +UPI001BFFF51E AA7F616AF8BBF601C1119851CD1E7D81 +UPI000BB13401 65B4E5BA3C7D523EE05D11D7D93246C7 +UPI00058EFAFA E209A7008D4E2643C34CAA4659995FB9 +UPI000012AB42 E933F38C54E844615DF09466B8372C27 +UPI0010249251 376E4A5FA9F5E0161E03F3168BFC91A8 +UPI0001F8B5D5 6B2B77E4162A054DA364AB5D0F4DFC3B +UPI000CA31F98 F6466B0375E6205467BE0F78FB708040 +UPI000809885D 972EED30B388FBFF1F446ABDC2CD57CD \ No newline at end of file diff --git a/src/python/test/xrefs/flatfiles/config.ini b/src/python/test/xrefs/flatfiles/config.ini new file mode 100644 index 000000000..1bc1619e6 --- /dev/null +++ b/src/python/test/xrefs/flatfiles/config.ini @@ -0,0 +1,88 @@ +[source ArrayExpress::MULTI] +name = ArrayExpress +order = 50 +priority = 1 +prio_descr = multi +parser = ArrayExpressParser + +[source UniParc::MULTI] +name = UniParc +order = 20 +priority = 1 +prio_descr = multi +parser = ChecksumParser + +[source DBASS3::homo_sapiens] +name = DBASS3 +order = 50 +priority = 1 +prio_descr = human +parser = DBASSParser + +[source MIM::homo_sapiens] +name = MIM +order = 10 +priority = 1 
+prio_descr = human +parser = MIMParser + +[source Reactome::MULTI] +name = Reactome +order = 80 +priority = 1 +prio_descr = multi +parser = ReactomeParser + +[source RefSeq_dna::MULTI-vertebrate] +name = RefSeq_dna +order = 15 +priority = 2 +prio_descr = verts +parser = RefSeqParser + +[source RefSeq_dna::gencode] +name = RefSeq_dna +order = 15 +priority = 2 +prio_descr = human +parser = RefSeqParser + +[source RefSeq_dna::MULTI-Plants] +name = RefSeq_dna +order = 15 +priority = 2 +prio_descr = plants +parser = RefSeqParser + +[source RefSeq_peptide::gencode] +name = RefSeq_peptide +order = 30 +priority = 2 +prio_descr = human +parser = RefSeqParser + +[source RefSeq_peptide::MULTI-Plants] +name = RefSeq_peptide +order = 25 +priority = 2 +prio_descr = plants +parser = RefSeqParser + +[source RefSeq_peptide::MULTI-vertebrate] +name = RefSeq_peptide +order = 25 +priority = 2 +prio_descr = verts +parser = RefSeqParser + +[species vertebrates] +taxonomy_id = 7742 +sources = ArrayExpress::MULTI,UniParc::MULTI,Reactome::MULTI,RefSeq_dna::MULTI-vertebrate,RefSeq_peptide::MULTI-vertebrate + +[species homo_sapiens] +taxonomy_id = 9606 +sources = DBASS3::homo_sapiens,MIM::homo_sapiens,RefSeq_dna::gencode,RefSeq_peptide::gencode + +[species plants] +taxonomy_id = 33090 +sources = ArrayExpress::MULTI,UniParc::MULTI,Reactome::MULTI,RefSeq_dna::MULTI-Plants,RefSeq_peptide::MULTI-Plants diff --git a/src/python/test/xrefs/flatfiles/peptides.fa b/src/python/test/xrefs/flatfiles/peptides.fa new file mode 100644 index 000000000..210faceec --- /dev/null +++ b/src/python/test/xrefs/flatfiles/peptides.fa @@ -0,0 +1,200 @@ +>1 +MFMINILMLIIPILLAVAFLTLVERKVLGYMQLRKGPNVVGPYGLLQPIADAIKLFIKEP +LRPATSSASMFILAPIMALGLALTMWIPLPMPYPLINMNLGVLFMLAMSSLAVYSILWSG +WASNSKYALIGALRAVAQTISYEVTLAIILLSVVLMSGSFTLSTLITTQEQMWLILPAWP +LAMMWFISTLAETNRAPFDLTEGESELVSGFNVEYAAGPFALFFMAEYANIIMMNIFTAI +LFLGTSHNPHMPELYTINFTIKSLLLTMSFLWIRASYPRFRYDQLMHLLWKNFLPLTLAL +CMWHVSLPILTSGIPPQT +>2 +MNPIIFIIILLTIMLGTIIVMISSHWLLVWIGFEMNMLAIIPIMMKNHNPRATEASTKYF +LTQSTASMLLMMAVIINVMFSGQWTVMKLFSPMASMLMTMALAMKLGMAPFHFWVPEVTQ +GIPLSSGLILLTWQKLAPMSVLYQIFPSINLNLILTLSVLSILIGGWGGLNQTQLRKIMA +YSSIAHMGWMTAVLPYNPTMTLLNLIIYIIMTSTMFTMFMANSTTTTLSLSHTWNKTPIM +TVLILATLLSMGGLPPLSGFMPKWMIIQEMTKNNSIILPTFMAITALLNLYFYMRLTYST +TLTMFPSTNNMKMKWQFPLMKKMTFLPTMVVLSTMMLPLTPMLSVLE +>3 +MFINRWLFSTSHKDIGTLYLLFDAWAGMVGTALSLLIRAELGQPGTLLGDDQIYNAVVTA +HAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEA +GAGTGWTVYPPLAGNLAHAGASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMSQYQ +TPLFVWSVMITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGH +PEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWAMMSIGFLGFIVWAHHMFTVGMDVD +TRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWSPAMMWALGFIFLFTVGGLTGIVLAN +SSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFVHWFPLFSGYTLNDTWAKIHFAIMFVG +VNMTFFPQHFLGLSGMPRRYSDYPDAYTMWNTISSMGSFISLTAVMLMVFIIWEAFASKR +EVLTVDLTTTNLEWLNGCPPPYHTFEEPTYVNLK +>4 +MAYPMQLGFQDATSPIMEELLHFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQE +VETIWTILPAIILILIALPSLRILYMMDEINNPSLTVKTMGHQWYWSYEYTDYEDLSFDS +YMIPTSELKPGELRLLEVDNRVVLPMEMTIRMLVSSEDVLHSWAVPSLGLKTDAIPGRLN +QTTLMSSRPGLYYGQCSEICGSNHSFMPIVLELVPLKYFEKWSASML +>5 +MPQLDTSTWLTMILSMFLTLFIIFQLKVSKHNFYHNPELTPTKMLKQNTPWETKWTKIYL +PLLLPL +>6 +MNENLFTSFITPVILGLPLVTLIVLFPSLLFPTSNRLVSNRFVTLQQWMLQLVSKQMMSI +HNSKGQTWTLMLMSLILFIGSTNLLGLLPHSFTPTTQLSMNLGMAIPLWAGAVITGFRNK +TKASLAHFLPQGTPTPLIPMLVIIETISLFIQPMALAVRLTANITAGHLLIHLIGGATLA +LMSISTTTALITFTILILLTILEFAVAMIQAYVFTLLVSLYLHDNT +>7 +MTHQTHAYHMVNPSPWPLTGALSALLMTSGLTMWFHFNSMTLLMIGLTTNMLTMYQWWRD +VIRESTFQGHHTPAVQKGLRYGMILFIISEVLFFTGFFWAFYHSSLAPTPELGGCWPPTG 
+IHPLNPLEVPLLNTSVLLASGVSITWAHHSLMEGDRKHMLQALFITITLGVYFTLLQASE +YYEAPFTISDGVYGSTFFVATGFHGLHVIIGSTFLIVCFFRQLKFHFTSNHHFGFEAAAW +YWHFVDVVWLFLYVSIYWWG +>8 +MNLMLALLTNFTLATLLVIIAFWLPQLNVYSEKTSPYECGFDPMGSARLPFSMKFFLVAI +TFLLFDLEIALLLPLPWASQTANLNTMLTMALFLIILLAVSLAYEWTQKGLEWTE +>9 +MSMVYMNIMMAFTVSLVGLLMYRSHLMSSLLCLEGMMLSLFVMAALTILNSHFTLASMMP +IILLVFAACEAALGLSLLVMVSNTYGTDYVQNLNLLQC +>10 +MLKYIIPTIMLMPLTWLSKNNMIWVNSTAHSLLISFTSLLLMNQFGDNSLNFSLLFFSDS +LSTPLLILTMWLLPLMLMASQHHLSKENLTRKKLFITMLISLQLFLIMTFTAMELILFYI +LFEATLVPTLIIITRWGNQTERLNAGLYFLFYTLAGSLPLLVALIYIQNTVGSLNFLMLQ +YWVQPVHNSWSNVFMWLACMMAFMVKMPLYGLHLWLPKAHVEAPIAGSMVLAAVLLKLGG +YGMLRITLILNPMTDFMAYPFIMLSLWGMIMTSSICLRQTDLKSLIAYSSVSHMALVIVA +ILIQTPWSYMGATALMIAHGLTSSMLFCLANSNYERIHSRTMILARGLQTLLPLMATWWL +LASLTNLALPPTINLIGELFVVMSTFSWSNITIILMGVNMVITALYSLYMLIMTQRGKYT +YHINNISPSFTRENALMSLHILPLLLLTLNPKIILGPLY +>11 +MNMFSSLSLVTLLLLTTPIMMMSFNTYKPSNYPLYVKTAISYAFITSMIPTMMFIHSGQE +LIISNWHWLTIQTLKLSLSFKMDYFSMMFIPVALFVTWSIMEFSMWYMYSDPNINKFFKY +LLLFLITMLILVTANNLFQLFIGWEGVGIMSFLLIGWWYGRADANTAALQAILYNRIGDI +GFILAMAWFLTNLNTWDLQQIFMLNPSDSNMPLIGLALAATGKSAQFGLHPWLPSAMEGP +TPVSALLHSSTMVVAGIFLLIRFYPLTENNKYIQSITLCLGAITTLFTAMCALTQNDIKK +IIAFSTSSQLGLMMVTIGINQPYLAFLHICTHAFFKAMLFMCSGSIIHSLNDEQDIRKMG +GLFKAMPFTTTALIVGSLALTGMPFLTGFYSKDLIIEAANTSYTNAWALLMTLIATSFTA +IYSTRIIFFALLGQPRFPTLVNINENNPLLINSIKRLLIGSLFAGYIISNNIPPTTIPQM +TMPYYLKTTALIVTILGFILALEISNMTKNLKYHYPSNAFKFSTLLGYFPTIMHRLAPYM +NLSMSQKSASSLLDLIWLEAILPKTISLAQMKASTLVTNQKGLIKLYFLSFLITILISMI +LFNFHE +>12 +MMLYIVFILSVIFVMGFVGFSSKPSPIYGGLGLIVSGGVGCGIVLNFGGSFLGLMVFLIY +LGGMMVVFGYTTAMATEQYPEIWLSNKAVLGAFVTGLLMEFFMVYYVLKDKEVEVVFEFN +GLGDWVIYDTGDSGFFSEEAMGIAALYSYGTWLVIVTGWSLLIGVVVIMEITRGN +>13 +MTNIRKSHPLMKIVNNAFIDLPAPSNISSWWNFGSLLGICLILQILTGLFLAMHYTSDTT +TAFSSVTHICRDVNYGWIIRYMHANGASMFFICLYMHVGRGLYYGSYTFLETWNIGVILL +LTVMATAFMGYVLPWGQMSFWGATVITNLLSAIPYIGTNLVEWIWGGFSVDKATLTRFFA +FHFILPFIIMAIAMVHLLFLHETGSNNPTGISSDVDKIPFHPYYTIKDILGALLLILALM +LLVLFAPDLLGDPDNYTPANPLNTPPHIKPEWYFLFAYAILRSIPNKLGGVLALAFSILI +LALIPLLHTSKQRSMMFRPLSQCLFWALVADLLTLTWIGGQPVEHPYITIGQLASVLYFL +LILVLMPTAGTIENKLLKW +>14 +MKDFLGWLERFFCPPPCVYLMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQL +NLEGKNYCTAKTLYISDSDKRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLK +NADLCIASGTKVALFNRLRSQTVSTRYLHVEGGNFHASSQQWGAFYIHLLDDDESEGEEF +TVRDGYIHYGQTVKLVCSVTGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTER +MYLCLSQERIIQFQATPCPKEPNKEMINDGASWTIISTDKAEYTFYEGMGPVLAPVTPVP +VVESLQLNGGGDVAMLELTGQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGW +RWVRQPVQVPVTLVRNDGIIYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNT +NSEGSYTNVSTNSTSVTSSTATVVS +>15 +MREAMRNYLKERGDQTVLILHAKVAQKSYGNEKRFFCPPPCVYLMGSGWKKKKEQMERDG +CSEQESQPCAFIGIGNSDQEMQQLNLEGKNYCTAKTLYISDSDKRKHFMLSVKMFYGNSD +DIGVFLSKRIKVISKPSKKKQSLKNADLCIASGTKVALFNRLRSQTVSTRYLHVEGGNFH +ASSQQWGAFYIHLLDDDESEGEEFTVRDGYIHYGQTVKLVCSVTGMALPRLIIRKVDKQT +ALLDADDPVSQLHKCAFYLKDTERMYLCLSQERIIQFQATPCPKEPNKEMINDGASWTII +STDKAEYTFYEGMGPVLAPVTPVPVVESLQLNGGGDVAMLELTGQNFTPNLRVWFGDVEA +ETMYRCGESMLCVVPDISAFREGWRWVRQPVQVPVTLVRNDGIIYSTSLTFTYTPEPGPR +PHCSAAGAILRANSSQVPPNESNTNSEGSYTNVSTNSTSVTSSTATVVS +>16 +MDQMEGSPAEEPPAHAPSLGKFGERPPPKRLTREAMRNYLKERGDQTVLILHAKVAQKSY +GNEKRFFCPPPCVYLMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQLNLEGK +NYCTAKTLYISDSDKRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLKNADLC +IASGTKVALFNRLRSQTVSTRYLHVEGGNFHASSQQWGAFYIHLLDDDESEGEEFTVRDG +YIHYGQTVKLVCSVTGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTERMYLCL +SQERIIQFQATPCPKEPNKEMINDGASWTIISTDKAEYTFYEGMGPVLAPVTPVPVVESL +QLNGGGDVAMLELTGQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGWRWVRQ +PVQVPVTLVRNDGIIYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNTNSEGS +YTNVSTNSTSVTSSTATVVS 
+>17 +MNEKGWELKGAGSHLENTHLRRARPKTRITGALPMDQMEGSPAEEPPAHAPSLGKFGERP +PPKRLTREAMRNYLKERGDQTVLILHAKVAQKSYGNEKRFFCPPPCVYLMGSGWKKKKEQ +MERDGCSEQESQPCAFIGIGNSDQEMQQLNLEGKNYCTAKTLYISDSDKRKHFMLSVKMF +YGNSDDIGVFLSKRIKVISKPSKKKQSLKNADLCIASGTKVALFNRLRSQTVSTRYLHVE +GGNFHASSQQWGAFYIHLLDDDESEGEEFTVRDGYIHYGQTVKLVCSVTGMALPRLIIRK +VDKQTALLDADDPVSQLHKCAFYLKDTERMYLCLSQERIIQFQATPCPKEPNKEMINDGA +SWTIISTDKAEYTFYEGMGPVLAPVTPVPVVESLQLNGGGDVAMLELTGQNFTPNLRVWF +GDVEAETMYRCGESMLCVVPDISAFREGWRWVRQPVQVPVTLVRNDGIIYSTSLTFTYTP +EPGPRPHCSAAGAILRANSSQVPPNESNTNSEGSYTNVSTNSTSVTSSTATVVS +>18 +MLHRLAPGTPSGVSTRRQTLRKFGERPPPKRLTREAMRNYLKERGDQTVLILHAKVAQKS +YGNEKRFFCPPPCVYLMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQLNLEG +KNYCTAKTLYISDSDKRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLKNADL +CIASGTKVALFNRLRSQTVSTRYLHVEGGNFHASSQQWGAFYIHLLDDDESEGEEFTVRD +GYIHYGQTVKLVCSVTGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTERMYLC +LSQERIIQFQATPCPKEPNKEMINDGASWTIISTDKAEYTFYEGMGPVLAPVTPVPVVES +LQLNGGGDVAMLELTGQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGWRWVR +QPVQVPVTLVRNDGIIYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNTNSEG +SYTNVSTNSTSVTSSTATVVS +>19 +MAWIKRKFGERPPPKRLTREAMRNYLKERGDQTVLILHAKVAQKSYGNEKRFFCPPPCVY +LMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQLNLEGKNYCTAKTLYISDSD +KRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLKNADLCIASGTKVALFNRLR +SQTVSTRYLHVEGGNFHASSQQWGAFYIHLLDDDESEGEEFTVRDGYIHYGQTVKLVCSV +TGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTERMYLCLSQERIIQFQATPCP +KEPNKEMINDGASWTIISTDKAEYTFYEGMGPVLAPVTPVPVVESLQLNGGGDVAMLELT +GQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGWRWVRQPVQVPVTLVRNDGI +IYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNTNSEGSYTNVSTNSTSVTSS +TATVVS +>20 +MAWIKRKFGERPPPKRLTREAMRNYLKERGDQTVLILHAKVAQKSYGNEKRFFCPPPCVY +LMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQLNLEGKNYCTAKTLYISDSD +KRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLKNADLDDDESEGEEFTVRDG +YIHYGQTVKLVCSVTGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTERMYLCL +SQERIIQFQATPCPKEPNKEMINDGASWTIISTDKAEYTFYEGMGPVLAPVTPVPVVESL +QLNGGGDVAMLELTGQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGWRWVRQ +PVQVPVTLVRNDGIIYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNTNSEGS +YTNVSTNSTSVTSSTATVVS +>21 +MAWIKRKFGERPPPKRLTREAMRNYLKERGDQTVLILHAKVAQKSYGNEKRFFCPPPCVY +LMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQLNLEGKNYCTAKTLYISDSD +KRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLKNADLCIASGTKVALFNRLR +SQTVSTRYLHVEGGNFHASSQQWGAFYIHLLDDDESEGEEFTVRDGYIHYGQTVKLVCSV +TGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTERMYLCLSQERIIQFQLNGGG +DVAMLELTGQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGWRWVRQPVQVPV +TLVRNDGIIYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNTNSEGSYTNVST +NSTSVTSSTATVVS +>22 +MEGCLPTHHTLPEKHLYAHWLLQRKFGERPPPKRLTREAMRNYLKERGDQTVLILHAKVA +QKSYGNEKRFFCPPPCVYLMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQLN +LEGKNYCTAKTLYISDSDKRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLKN +ADLCIASGTKVALFNRLRSQTVSTRYLHVEGGNFHASSQQWGAFYIHLLDDDESEGEEFT +VRDGYIHYGQTVKLVCSVTGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTERM +YLCLSQERIIQFQATPCPKEPNKEMINDGASWTIISTDKAEYTFYEGMGPVLAPVTPVPV +VESLQLNGGGDVAMLELTGQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGWR +WVRQPVQVPVTLVRNDGIIYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNTN +SEGSYTNVSTNSTSVTSSTATVVS +>23 +MIGLLYPALSRKFGERPPPKRLTREAMRNYLKERGDQTVLILHAKVAQKSYGNEKRFFCP +PPCVYLMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQLNLEGKNYCTAKTLY +ISDSDKRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLKNADLCIASGTKVAL +FNRLRSQTVSTRYLHVEGGNFHASSQQWGAFYIHLLDDDESEGEEFTVRDGYIHYGQTVK +LVCSVTGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTERMYLCLSQERIIQFQ +ATPCPKEPNKEMINDGASWTIISTDKAEYTFYEGMGPVLAPVTPVPVVESLQLNGGGDVA +MLELTGQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGWRWVRQPVQVPVTLV +RNDGIIYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNTNSEGSYTNVSTNST +SVTSSTATVVS +>24 
+MDVVDSLLMNESNLTPPCELGIENETLFCLDQPHPSKEWQPAVQILLYSLIFLLSVLGNT +LVITVLIRNKRMRTVTNIFLLSLAVSDLMLCLFCMPFNLIPNLLKDFIFGSAVCKTTTYF +MGTSVSVSTFNLVAISLERYGAICKPLQSRVWQTKSHALKVIAATWCLSFTIMTPYPIYS +NLVPFTKNNNQTANMCRFLLPSDVMQQSWHTFLLLILFLIPGIVMMVAYGLISLELYQGI +KFDASQKKSARERKRSSASSGRYPHGQEARHPHAHGHRGPLLPVLDAHLQRQRLEGL +>25 +MDVVDSLLMNESNLTPPCELGIENETLFCLDQPHPSKEWQPAVQILLYSLIFLLSVLGNT +LVITVLIRNKRMRTVTNIFLLSLAVSDLMLCLFCMPFNLIPNLLKDFIFGSAVCKTTTYF +MGTSVSVSTFNLVAISLERYGAICKPLQSRVWQTKSHALKVIAATWCLSFTIMTPYPIYS +NLVPFTKNNNQTANMCRFLLPSDVMQQSWHTFLLLILFLIPGIVMMVAYGLISLELYQGI +KFDASQKKSARERKRSSASSGRYADSAGCCLQRPKHPRKLELRQLSTGSAGRADRIRSSS +PAASLMAKKRVIRMLMVIVVLFFLCWMPIFSANAWRAFDTASAERRLSGTPIAFILLLSY +TSSCVNPIIYCFMNKRIVEAALRLRSPSLFQEHSVTTHLTMTTDGNRKQTLFWPFSVLQT +SRSKGEL +>26 +MLQEESDLSLIIAQIVQKLKGSNLYAQLERQAWASLQRPEIKLESLKEDIKEFFKISGWE +KKLQNAVYSELSVFPLPSHPAAPPEHLKEPLVYMRKAQGSWEKRILKSLNSMCTELSIPL +ARKRPVGEQKELLNKWNEMGTDEPDLSLFRPVYAPKDFLEVLINLRNPNYESGDSLSFRT +HLGLIQVPLKVKDIPELKEFFVELGLTTGQLGIDDSTQVPPELFENEHVRIGQKVLTQQD +SAAAQQYIRQGSPTALRAELWALILNISSHPEDILYYEQLKTNVIQHDLLVDSLIYKDVK +LTASNDDYYFVFEDYLYQVLLCFSRDTSVLSHFAYNSASPPKSYIRGKLGLEEYAVF diff --git a/src/python/test/xrefs/flatfiles/sources.json b/src/python/test/xrefs/flatfiles/sources.json new file mode 100644 index 000000000..1b45a2acb --- /dev/null +++ b/src/python/test/xrefs/flatfiles/sources.json @@ -0,0 +1,16 @@ +[ + { + "name" : "ArrayExpress", + "parser" : "ArrayExpressParser", + "file" : "Database", + "db" : "core", + "priority" : 1 + }, + { + "name" : "RNACentral", + "parser" : "ChecksumParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/md5/md5.tsv.gz", + "db" : "checksum", + "priority" : 1 + } +] \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_arrayexpress_parser.py b/src/python/test/xrefs/parsers/test_arrayexpress_parser.py index db0379e08..b73fd037c 100644 --- a/src/python/test/xrefs/parsers/test_arrayexpress_parser.py +++ b/src/python/test/xrefs/parsers/test_arrayexpress_parser.py @@ -37,11 +37,9 @@ def run_and_validate_parsing(arrayexpress_parser: ArrayExpressParser, mock_xref_ ), f"{prefix}Expected 'Added {expected_xrefs} DIRECT xrefs' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id and species_id -def test_arrayexpress_no_source_id(arrayexpress_parser: ArrayExpressParser, test_no_source_id: Callable[[ArrayExpressParser, int], None]) -> None: - test_no_source_id(arrayexpress_parser, SPECIES_ID_HUMAN) - -def test_arrayexpress_no_species_id(arrayexpress_parser: ArrayExpressParser, test_no_species_id: Callable[[ArrayExpressParser, int], None]) -> None: - test_no_species_id(arrayexpress_parser, SOURCE_ID_ARRAYEXPRESS) +def test_arrayexpress_missing_argument(arrayexpress_parser: ArrayExpressParser, test_parser_missing_argument: Callable[[ArrayExpressParser, str, int, int], None]) -> None: + test_parser_missing_argument(arrayexpress_parser, "source_id", SOURCE_ID_ARRAYEXPRESS, SPECIES_ID_HUMAN) + test_parser_missing_argument(arrayexpress_parser, "species_id", SOURCE_ID_ARRAYEXPRESS, SPECIES_ID_HUMAN) # Test case to check if parsing is skipped when no species name can be found def test_no_species_name(mock_xref_dbi: DBConnection, arrayexpress_parser: ArrayExpressParser) -> None: diff --git a/src/python/test/xrefs/parsers/test_ccds_parser.py b/src/python/test/xrefs/parsers/test_ccds_parser.py index 1f7fe93e9..4b22225ef 100644 --- a/src/python/test/xrefs/parsers/test_ccds_parser.py +++ 
b/src/python/test/xrefs/parsers/test_ccds_parser.py @@ -36,11 +36,9 @@ def run_and_validate_parsing(ccds_parser: CCDSParser, mock_xref_dbi: DBConnectio ), f"{prefix}Expected 'Parsed CCDS identifiers, added {expected_xrefs} xrefs and {expected_direct_xrefs} direct_xrefs' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id and species_id -def test_ccds_no_source_id(ccds_parser: CCDSParser, test_no_source_id: Callable[[CCDSParser, int], None]) -> None: - test_no_source_id(ccds_parser, SPECIES_ID_HUMAN) - -def test_ccds_no_species_id(ccds_parser: CCDSParser, test_no_species_id: Callable[[CCDSParser, int], None]) -> None: - test_no_species_id(ccds_parser, SOURCE_ID_CCDS) +def test_ccds_missing_argument(ccds_parser: CCDSParser, test_parser_missing_argument: Callable[[CCDSParser, str, int, int], None]) -> None: + test_parser_missing_argument(ccds_parser, "source_id", SOURCE_ID_CCDS, SPECIES_ID_HUMAN) + test_parser_missing_argument(ccds_parser, "species_id", SOURCE_ID_CCDS, SPECIES_ID_HUMAN) # Test case to check if an error is raised when no CCDS database is provided def test_no_ccds_db(ccds_parser: CCDSParser) -> None: diff --git a/src/python/test/xrefs/parsers/test_dbass_parser.py b/src/python/test/xrefs/parsers/test_dbass_parser.py index c6ec23967..8b19caf55 100644 --- a/src/python/test/xrefs/parsers/test_dbass_parser.py +++ b/src/python/test/xrefs/parsers/test_dbass_parser.py @@ -41,14 +41,10 @@ def run_and_validate_parsing(dbass_parser: DBASSParser, mock_xref_dbi: DBConnect ), f"{prefix}Expected 'Skipped {expected_skipped_xrefs} unmapped xrefs' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_dbass_no_source_id(dbass_parser: DBASSParser, test_no_source_id: Callable[[DBASSParser, int], None]) -> None: - test_no_source_id(dbass_parser, SPECIES_ID_HUMAN) - -def test_dbass_no_species_id(dbass_parser: DBASSParser, test_no_species_id: Callable[[DBASSParser, int], None]) -> None: - test_no_species_id(dbass_parser, SOURCE_ID_DBASS3) - -def test_dbass_no_file(dbass_parser: DBASSParser, test_no_file: Callable[[DBASSParser, int, int], None]) -> None: - test_no_file(dbass_parser, SOURCE_ID_DBASS3, SPECIES_ID_HUMAN) +def test_dbass_missing_argument(dbass_parser: DBASSParser, test_parser_missing_argument: Callable[[DBASSParser, str, int, int], None]) -> None: + test_parser_missing_argument(dbass_parser, "source_id", SOURCE_ID_DBASS3, SPECIES_ID_HUMAN) + test_parser_missing_argument(dbass_parser, "species_id", SOURCE_ID_DBASS3, SPECIES_ID_HUMAN) + test_parser_missing_argument(dbass_parser, "file", SOURCE_ID_DBASS3, SPECIES_ID_HUMAN) # Test case to check if an error is raised when the file is not found def test_dbass_file_not_found(dbass_parser: DBASSParser, test_file_not_found: Callable[[DBASSParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_entrezgene_parser.py b/src/python/test/xrefs/parsers/test_entrezgene_parser.py index f0d31f8ae..32709b79a 100644 --- a/src/python/test/xrefs/parsers/test_entrezgene_parser.py +++ b/src/python/test/xrefs/parsers/test_entrezgene_parser.py @@ -38,14 +38,10 @@ def run_and_validate_parsing(entrezgene_parser: EntrezGeneParser, mock_xref_dbi: ), f"{prefix}Expected '{expected_entrez_xrefs} EntrezGene Xrefs and {expected_wiki_xrefs} WikiGene Xrefs added with {expected_synonyms} synonyms' in result_message, but got: '{result_message}'" # Test cases to check if mandatory 
parser arguments are passed: source_id, species_id, and file -def test_entrezgene_no_source_id(entrezgene_parser: EntrezGeneParser, test_no_source_id: Callable[[EntrezGeneParser, int], None]) -> None: - test_no_source_id(entrezgene_parser, SPECIES_ID_HUMAN) - -def test_entrezgene_no_species_id(entrezgene_parser: EntrezGeneParser, test_no_species_id: Callable[[EntrezGeneParser, int], None]) -> None: - test_no_species_id(entrezgene_parser, SOURCE_ID_ENTREZGENE) - -def test_entrezgene_no_file(entrezgene_parser: EntrezGeneParser, test_no_file: Callable[[EntrezGeneParser, int, int], None]) -> None: - test_no_file(entrezgene_parser, SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN) +def test_entrezgene_missing_argument(entrezgene_parser: EntrezGeneParser, test_parser_missing_argument: Callable[[EntrezGeneParser, str, int, int], None]) -> None: + test_parser_missing_argument(entrezgene_parser, "source_id", SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN) + test_parser_missing_argument(entrezgene_parser, "species_id", SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN) + test_parser_missing_argument(entrezgene_parser, "file", SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN) # Test case to check if an error is raised when the file is not found def test_entrezgene_file_not_found(entrezgene_parser: EntrezGeneParser, test_file_not_found: Callable[[EntrezGeneParser, int, int], None]) -> None: @@ -111,6 +107,10 @@ def test_malformed_headers(entrezgene_parser: EntrezGeneParser, header: str) -> } ) +# Test case to check if an error is raised when the required source_id is missing +def test_entrezgene_missing_required_source_id(entrezgene_parser: EntrezGeneParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[EntrezGeneParser, DBConnection, str, int, int, str], None]) -> None: + test_missing_required_source_id(entrezgene_parser, mock_xref_dbi, 'WikiGene', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN) + # Test case to check if an error is raised when the file has insufficient columns def test_insufficient_columns(entrezgene_parser: EntrezGeneParser) -> None: mock_file = io.StringIO() diff --git a/src/python/test/xrefs/parsers/test_hgnc_parser.py b/src/python/test/xrefs/parsers/test_hgnc_parser.py index 7f920779f..f09bae291 100644 --- a/src/python/test/xrefs/parsers/test_hgnc_parser.py +++ b/src/python/test/xrefs/parsers/test_hgnc_parser.py @@ -66,14 +66,15 @@ def run_and_validate_parsing(hgnc_parser: HGNCParser, mock_xref_dbi: DBConnectio ), f"{prefix}Expected '{expected_mismatch} HGNC ids could not be associated in xrefs' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_hgnc_no_source_id(hgnc_parser: HGNCParser, test_no_source_id: Callable[[HGNCParser, int], None]) -> None: - test_no_source_id(hgnc_parser, SPECIES_ID_HUMAN) +def test_hgnc_missing_argument(hgnc_parser: HGNCParser, test_parser_missing_argument: Callable[[HGNCParser, str, int, int], None]) -> None: + test_parser_missing_argument(hgnc_parser, "source_id", SOURCE_ID_HGNC, SPECIES_ID_HUMAN) + test_parser_missing_argument(hgnc_parser, "species_id", SOURCE_ID_HGNC, SPECIES_ID_HUMAN) + test_parser_missing_argument(hgnc_parser, "file", SOURCE_ID_HGNC, SPECIES_ID_HUMAN) -def test_hgnc_no_species_id(hgnc_parser: HGNCParser, test_no_species_id: Callable[[HGNCParser, int], None]) -> None: - test_no_species_id(hgnc_parser, SOURCE_ID_HGNC) - -def test_hgnc_no_file(hgnc_parser: HGNCParser, test_no_file: Callable[[HGNCParser, int, int], None]) -> None: - 
test_no_file(hgnc_parser, SOURCE_ID_HGNC, SPECIES_ID_HUMAN) +# Test case to check if an error is raised when the required source_id is missing +def test_hgnc_missing_required_source_id(hgnc_parser: HGNCParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[HGNCParser, DBConnection, str, int, int, str], None]) -> None: + hgnc_parser.get_source_name_for_source_id = MagicMock(return_value="HGNC") + test_missing_required_source_id(hgnc_parser, mock_xref_dbi, 'HGNC', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'ccds') # Test case to check if an error is raised when no CCDS database is provided def test_no_ccds_db(hgnc_parser: HGNCParser) -> None: @@ -106,7 +107,7 @@ def test_successful_parsing_without_existing_xrefs(mock_xref_dbi: DBConnection, hgnc_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) hgnc_parser.construct_db_url = MagicMock(return_value="dummy_db_url") hgnc_parser.get_ccds_to_ens_mapping = MagicMock(return_value={}) - hgnc_parser.get_valid_codes = MagicMock(return_value={}) + hgnc_parser.get_acc_to_xref_ids = MagicMock(return_value={}) hgnc_parser.get_valid_xrefs_for_dependencies = MagicMock(return_value={}) # Run and validate parsing for HGNC file @@ -132,7 +133,7 @@ def test_successful_parsing_with_existing_xrefs(mock_xref_dbi: DBConnection, hgn hgnc_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) hgnc_parser.construct_db_url = MagicMock(return_value="dummy_db_url") hgnc_parser.get_ccds_to_ens_mapping = MagicMock(return_value={"CCDS12976": "CCDS12976", "CCDS8856": "CCDS8856", "CCDS53797": "CCDS53797"}) - hgnc_parser.get_valid_codes = MagicMock(return_value={"NM_130786": [12], "NR_026971": [34, 56], "NR_015380": [78], "NM_001088": [90]}) + hgnc_parser.get_acc_to_xref_ids = MagicMock(return_value={"NM_130786": [12], "NR_026971": [34, 56], "NR_015380": [78], "NM_001088": [90]}) hgnc_parser.get_valid_xrefs_for_dependencies = MagicMock(return_value={"503538": 123, "441376": 456, "51146": 789}) # Run and validate parsing for HGNC file diff --git a/src/python/test/xrefs/parsers/test_hpa_parser.py b/src/python/test/xrefs/parsers/test_hpa_parser.py index 838a3756e..04b860382 100644 --- a/src/python/test/xrefs/parsers/test_hpa_parser.py +++ b/src/python/test/xrefs/parsers/test_hpa_parser.py @@ -37,14 +37,10 @@ def run_and_validate_parsing(hpa_parser: HPAParser, mock_xref_dbi: DBConnection, ), f"{prefix}Expected '{expected_xrefs} direct xrefs successfully parsed' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_hpa_no_source_id(hpa_parser: HPAParser, test_no_source_id: Callable[[HPAParser, int], None]) -> None: - test_no_source_id(hpa_parser, SPECIES_ID_HUMAN) - -def test_hpa_no_species_id(hpa_parser: HPAParser, test_no_species_id: Callable[[HPAParser, int], None]) -> None: - test_no_species_id(hpa_parser, SOURCE_ID_HPA) - -def test_hpa_no_file(hpa_parser: HPAParser, test_no_file: Callable[[HPAParser, int, int], None]) -> None: - test_no_file(hpa_parser, SOURCE_ID_HPA, SPECIES_ID_HUMAN) +def test_hpa_missing_argument(hpa_parser: HPAParser, test_parser_missing_argument: Callable[[HPAParser, str, int, int], None]) -> None: + test_parser_missing_argument(hpa_parser, "source_id", SOURCE_ID_HPA, SPECIES_ID_HUMAN) + test_parser_missing_argument(hpa_parser, "species_id", SOURCE_ID_HPA, SPECIES_ID_HUMAN) + test_parser_missing_argument(hpa_parser, "file", SOURCE_ID_HPA, 
SPECIES_ID_HUMAN) # Test case to check if an error is raised when the file is not found def test_hpa_file_not_found(hpa_parser: HPAParser, test_file_not_found: Callable[[HPAParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_jgi_protein_parser.py b/src/python/test/xrefs/parsers/test_jgi_protein_parser.py index 666e6fa95..d936b7da9 100644 --- a/src/python/test/xrefs/parsers/test_jgi_protein_parser.py +++ b/src/python/test/xrefs/parsers/test_jgi_protein_parser.py @@ -34,14 +34,10 @@ def run_and_validate_parsing(jgi_protein_parser: JGI_ProteinParser, mock_xref_db assert f"{expected_xrefs} JGI_ xrefs successfully parsed" in result_message, f"{prefix}Expected '{expected_xrefs} JGI_ xrefs successfully parsed' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_jgi_no_source_id(jgi_protein_parser: JGI_ProteinParser, test_no_source_id: Callable[[JGI_ProteinParser, int], None]) -> None: - test_no_source_id(jgi_protein_parser, SPECIES_ID_C_INTESTINALIS) - -def test_jgi_no_species_id(jgi_protein_parser: JGI_ProteinParser, test_no_species_id: Callable[[JGI_ProteinParser, int], None]) -> None: - test_no_species_id(jgi_protein_parser, SOURCE_ID_JGI) - -def test_jgi_no_file(jgi_protein_parser: JGI_ProteinParser, test_no_file: Callable[[JGI_ProteinParser, int, int], None]) -> None: - test_no_file(jgi_protein_parser, SOURCE_ID_JGI, SPECIES_ID_C_INTESTINALIS) +def test_jgi_missing_argument(jgi_protein_parser: JGI_ProteinParser, test_parser_missing_argument: Callable[[JGI_ProteinParser, str, int, int], None]) -> None: + test_parser_missing_argument(jgi_protein_parser, "source_id", SOURCE_ID_JGI, SPECIES_ID_C_INTESTINALIS) + test_parser_missing_argument(jgi_protein_parser, "species_id", SOURCE_ID_JGI, SPECIES_ID_C_INTESTINALIS) + test_parser_missing_argument(jgi_protein_parser, "file", SOURCE_ID_JGI, SPECIES_ID_C_INTESTINALIS) # Test case to check if an error is raised when the file is not found def test_jgi_file_not_found(jgi_protein_parser: JGI_ProteinParser, test_file_not_found: Callable[[JGI_ProteinParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_mgi_desc_parser.py b/src/python/test/xrefs/parsers/test_mgi_desc_parser.py index 02b46352b..cedf77c3e 100644 --- a/src/python/test/xrefs/parsers/test_mgi_desc_parser.py +++ b/src/python/test/xrefs/parsers/test_mgi_desc_parser.py @@ -40,14 +40,10 @@ def run_and_validate_parsing(mgi_desc_parser: MGIDescParser, mock_xref_dbi: DBCo ), f"{prefix}Expected '{expected_synonyms} synonyms added' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_mgi_desc_no_source_id(mgi_desc_parser: MGIDescParser, test_no_source_id: Callable[[MGIDescParser, int], None]) -> None: - test_no_source_id(mgi_desc_parser, SPECIES_ID_MOUSE) - -def test_mgi_desc_no_species_id(mgi_desc_parser: MGIDescParser, test_no_species_id: Callable[[MGIDescParser, int], None]) -> None: - test_no_species_id(mgi_desc_parser, SOURCE_ID_MGI_DESC) - -def test_mgi_desc_no_file(mgi_desc_parser: MGIDescParser, test_no_file: Callable[[MGIDescParser, int, int], None]) -> None: - test_no_file(mgi_desc_parser, SOURCE_ID_MGI_DESC, SPECIES_ID_MOUSE) +def test_mgi_desc_missing_argument(mgi_desc_parser: MGIDescParser, test_parser_missing_argument: Callable[[MGIDescParser, str, int, int], None]) -> None: +
test_parser_missing_argument(mgi_desc_parser, "source_id", SOURCE_ID_MGI_DESC, SPECIES_ID_MOUSE) + test_parser_missing_argument(mgi_desc_parser, "species_id", SOURCE_ID_MGI_DESC, SPECIES_ID_MOUSE) + test_parser_missing_argument(mgi_desc_parser, "file", SOURCE_ID_MGI_DESC, SPECIES_ID_MOUSE) # Test case to check if an error is raised when the file is not found def test_mgi_desc_file_not_found(mgi_desc_parser: MGIDescParser, test_file_not_found: Callable[[MGIDescParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_mgi_parser.py b/src/python/test/xrefs/parsers/test_mgi_parser.py index fab933d60..5897556cc 100644 --- a/src/python/test/xrefs/parsers/test_mgi_parser.py +++ b/src/python/test/xrefs/parsers/test_mgi_parser.py @@ -38,14 +38,10 @@ def run_and_validate_parsing(mgi_parser: MGIParser, mock_xref_dbi: DBConnection, ), f"{prefix}Expected '{expected_synonyms} synonyms added' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_mgi_no_source_id(mgi_parser: MGIParser, test_no_source_id: Callable[[MGIParser, int], None]) -> None: - test_no_source_id(mgi_parser, SPECIES_ID_MOUSE) - -def test_mgi_no_species_id(mgi_parser: MGIParser, test_no_species_id: Callable[[MGIParser, int], None]) -> None: - test_no_species_id(mgi_parser, SOURCE_ID_MGI) - -def test_mgi_no_file(mgi_parser: MGIParser, test_no_file: Callable[[MGIParser, int, int], None]) -> None: - test_no_file(mgi_parser, SOURCE_ID_MGI, SPECIES_ID_MOUSE) +def test_mgi_missing_argument(mgi_parser: MGIParser, test_parser_missing_argument: Callable[[MGIParser, str, int, int], None]) -> None: + test_parser_missing_argument(mgi_parser, "source_id", SOURCE_ID_MGI, SPECIES_ID_MOUSE) + test_parser_missing_argument(mgi_parser, "species_id", SOURCE_ID_MGI, SPECIES_ID_MOUSE) + test_parser_missing_argument(mgi_parser, "file", SOURCE_ID_MGI, SPECIES_ID_MOUSE) # Test case to check if an error is raised when the file is not found def test_mgi_file_not_found(mgi_parser: MGIParser, test_file_not_found: Callable[[MGIParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_mim2gene_parser.py b/src/python/test/xrefs/parsers/test_mim2gene_parser.py index 590c1c3bc..8dc2d284a 100644 --- a/src/python/test/xrefs/parsers/test_mim2gene_parser.py +++ b/src/python/test/xrefs/parsers/test_mim2gene_parser.py @@ -115,14 +115,10 @@ def run_and_validate_parsing(mim2gene_parser: Mim2GeneParser, mock_xref_dbi: DBC ), f"{prefix}Expected '{expected_missed_master} had missing master entries' in result message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_mim2gene_no_source_id(mim2gene_parser: Mim2GeneParser, test_no_source_id: Callable[[Mim2GeneParser, int], None]) -> None: - test_no_source_id(mim2gene_parser, SPECIES_ID_HUMAN) - -def test_mim2gene_no_species_id(mim2gene_parser: Mim2GeneParser, test_no_species_id: Callable[[Mim2GeneParser, int], None]) -> None: - test_no_species_id(mim2gene_parser, SOURCE_ID_MIM2GENE) - -def test_mim2gene_no_file(mim2gene_parser: Mim2GeneParser, test_no_file: Callable[[Mim2GeneParser, int, int], None]) -> None: - test_no_file(mim2gene_parser, SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) +def test_mim2gene_missing_argument(mim2gene_parser: Mim2GeneParser, test_parser_missing_argument: Callable[[Mim2GeneParser, str, int, int], None]) -> None: + test_parser_missing_argument(mim2gene_parser, "source_id", 
SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) + test_parser_missing_argument(mim2gene_parser, "species_id", SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) + test_parser_missing_argument(mim2gene_parser, "file", SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) # Test case to check if an error is raised when the file is not found def test_mim2gene_file_not_found(mim2gene_parser: Mim2GeneParser, test_file_not_found: Callable[[Mim2GeneParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_mim_parser.py b/src/python/test/xrefs/parsers/test_mim_parser.py index 676c182bf..ce5b4c187 100644 --- a/src/python/test/xrefs/parsers/test_mim_parser.py +++ b/src/python/test/xrefs/parsers/test_mim_parser.py @@ -52,14 +52,14 @@ def run_and_validate_parsing(mim_parser: MIMParser, mock_xref_dbi: DBConnection, ), f"{prefix}Expected '{expected_removed_entries} entries removed' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_mim_no_source_id(mim_parser: MIMParser, test_no_source_id: Callable[[MIMParser, int], None]) -> None: - test_no_source_id(mim_parser, SPECIES_ID_HUMAN) - -def test_mim_no_species_id(mim_parser: MIMParser, test_no_species_id: Callable[[MIMParser, int], None]) -> None: - test_no_species_id(mim_parser, SOURCE_ID_MIM) - -def test_mim_no_file(mim_parser: MIMParser, test_no_file: Callable[[MIMParser, int, int], None]) -> None: - test_no_file(mim_parser, SOURCE_ID_MIM, SPECIES_ID_HUMAN) +def test_mim_missing_argument(mim_parser: MIMParser, test_parser_missing_argument: Callable[[MIMParser, str, int, int], None]) -> None: + test_parser_missing_argument(mim_parser, "source_id", SOURCE_ID_MIM, SPECIES_ID_HUMAN) + test_parser_missing_argument(mim_parser, "species_id", SOURCE_ID_MIM, SPECIES_ID_HUMAN) + test_parser_missing_argument(mim_parser, "file", SOURCE_ID_MIM, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the required source_id is missing +def test_mim_missing_required_source_id(mim_parser: MIMParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[MIMParser, DBConnection, str, int, int, str], None]) -> None: + test_missing_required_source_id(mim_parser, mock_xref_dbi, 'MIM_GENE', SOURCE_ID_MIM, SPECIES_ID_HUMAN) # Test case to check if an error is raised when the file is not found def test_mim_file_not_found(mim_parser: MIMParser, test_file_not_found: Callable[[MIMParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_mirbase_parser.py b/src/python/test/xrefs/parsers/test_mirbase_parser.py index f9c426c3a..61ae37028 100644 --- a/src/python/test/xrefs/parsers/test_mirbase_parser.py +++ b/src/python/test/xrefs/parsers/test_mirbase_parser.py @@ -39,14 +39,10 @@ def run_and_validate_parsing(mirbase_parser: miRBaseParser, mock_xref_dbi: DBCon ), f"{prefix}Expected 'Read {expected_xrefs} xrefs from' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_mirbase_no_source_id(mirbase_parser: miRBaseParser, test_no_source_id: Callable[[miRBaseParser, int], None]) -> None: - test_no_source_id(mirbase_parser, SPECIES_ID_C_ELEGANS) - -def test_mirbase_no_species_id(mirbase_parser: miRBaseParser, test_no_species_id: Callable[[miRBaseParser, int], None]) -> None: - test_no_species_id(mirbase_parser, SOURCE_ID_MIRBASE) - -def test_mirbase_no_file(mirbase_parser: miRBaseParser, test_no_file: Callable[[miRBaseParser, int, int], None]) -> 
None: - test_no_file(mirbase_parser, SOURCE_ID_MIRBASE, SPECIES_ID_C_ELEGANS) +def test_mirbase_missing_argument(mirbase_parser: miRBaseParser, test_parser_missing_argument: Callable[[miRBaseParser, str, int, int], None]) -> None: + test_parser_missing_argument(mirbase_parser, "source_id", SOURCE_ID_MIRBASE, SPECIES_ID_C_ELEGANS) + test_parser_missing_argument(mirbase_parser, "species_id", SOURCE_ID_MIRBASE, SPECIES_ID_C_ELEGANS) + test_parser_missing_argument(mirbase_parser, "file", SOURCE_ID_MIRBASE, SPECIES_ID_C_ELEGANS) # Test case to check if an error is raised when the file is not found def test_mirbase_file_not_found(mirbase_parser: miRBaseParser, test_file_not_found: Callable[[miRBaseParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_reactome_parser.py b/src/python/test/xrefs/parsers/test_reactome_parser.py index 9187fde0e..92e18e0f1 100644 --- a/src/python/test/xrefs/parsers/test_reactome_parser.py +++ b/src/python/test/xrefs/parsers/test_reactome_parser.py @@ -46,6 +46,8 @@ def populate_xref_db(mock_xref_dbi: DBConnection): } ) + mock_xref_dbi.commit() + # Function to run and validate the parsing process def run_and_validate_parsing(reactome_parser: ReactomeParser, mock_xref_dbi: DBConnection, file: str, expected_processed: int, expected_dependent: int, expected_direct: int, expected_errors: int, prefix: str = None) -> None: if prefix is None: @@ -77,14 +79,10 @@ def run_and_validate_parsing(reactome_parser: ReactomeParser, mock_xref_dbi: DBC ), f"{prefix}Expected '{expected_errors} not found' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_reactome_no_source_id(reactome_parser: ReactomeParser, test_no_source_id: Callable[[ReactomeParser, int], None]) -> None: - test_no_source_id(reactome_parser, SPECIES_ID_HUMAN) - -def test_reactome_no_species_id(reactome_parser: ReactomeParser, test_no_species_id: Callable[[ReactomeParser, int], None]) -> None: - test_no_species_id(reactome_parser, SOURCE_ID_REACTOME) - -def test_reactome_no_file(reactome_parser: ReactomeParser, test_no_file: Callable[[ReactomeParser, int, int], None]) -> None: - test_no_file(reactome_parser, SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) +def test_reactome_missing_argument(reactome_parser: ReactomeParser, test_parser_missing_argument: Callable[[ReactomeParser, str, int, int], None]) -> None: + test_parser_missing_argument(reactome_parser, "source_id", SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) + test_parser_missing_argument(reactome_parser, "species_id", SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) + test_parser_missing_argument(reactome_parser, "file", SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) # Test case to check if parsing is skipped when no species name can be found def test_no_species_name(mock_xref_dbi: DBConnection, reactome_parser: ReactomeParser) -> None: @@ -117,31 +115,30 @@ def test_reactome_empty_file(reactome_parser: ReactomeParser, test_empty_file: C reactome_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_HUMAN: [SPECIES_NAME_HUMAN]}) test_empty_file(reactome_parser, 'Reactome', SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) -# Test case to check successful parsing of valid Reactome data without existing uniprot xrefs -def test_successful_parsing_without_existing_uniprot(mock_xref_dbi: DBConnection, reactome_parser: ReactomeParser) -> None: +# Test case to check successful parsing of valid Reactome data +def test_successful_parsing(mock_xref_dbi: DBConnection, 
reactome_parser: ReactomeParser) -> None: populate_xref_db(mock_xref_dbi) # Run and validate parsing for Uniprot and Ensembl Reactome files run_and_validate_parsing(reactome_parser, mock_xref_dbi, "reactome_UniProt", 8, 0, 0, 0) run_and_validate_parsing(reactome_parser, mock_xref_dbi, "reactome_ensembl", 14, 0, 13, 1) - # Check the row counts in the xref and direct_xref tables + # Check the row counts in the xref, direct_xref, and dependent_xref tables check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_GENE}") check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_TRANSCRIPT}") check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_DIRECT}") check_row_count(mock_xref_dbi, "gene_direct_xref", 6) check_row_count(mock_xref_dbi, "transcript_direct_xref", 4) check_row_count(mock_xref_dbi, "translation_direct_xref", 3) + check_row_count(mock_xref_dbi, "dependent_xref", 0) # Check the link between an xref and direct_xref tables check_direct_xref_link(mock_xref_dbi, "gene", "R-HSA-1643685", "ENSG00000000419") check_direct_xref_link(mock_xref_dbi, "transcript", "R-HSA-199991", "ENST00000000233") check_direct_xref_link(mock_xref_dbi, "translation", "R-HSA-199991", "ENSP00000000233") -# Test case to check successful parsing of valid Reactome data with existing uniprot xrefs -def test_successful_parsing_with_existing_uniprot(mock_xref_dbi: DBConnection, reactome_parser: ReactomeParser) -> None: - populate_xref_db(mock_xref_dbi) - reactome_parser.get_valid_codes = MagicMock(return_value={"A0A075B6P5": [12], "A0A075B6S6" : [34, 56], "A0A087WPF7": [78], "A0A096LNF2": [90]}) + # Add uniprot xrefs + reactome_parser.get_acc_to_xref_ids = MagicMock(return_value={"A0A075B6P5": [12], "A0A075B6S6" : [34, 56], "A0A087WPF7": [78], "A0A096LNF2": [90]}) # Run and validate re-parsing for Uniprot and Ensembl Reactome files run_and_validate_parsing(reactome_parser, mock_xref_dbi, "reactome_UniProt", 8, 6, 0, 0, "Re-parsing: ") diff --git a/src/python/test/xrefs/parsers/test_refseq_parser.py b/src/python/test/xrefs/parsers/test_refseq_parser.py index 2b8a77f2c..6f9c5f1f1 100644 --- a/src/python/test/xrefs/parsers/test_refseq_parser.py +++ b/src/python/test/xrefs/parsers/test_refseq_parser.py @@ -142,17 +142,13 @@ def run_and_validate_parsing(refseq_parser: RefSeqParser, mock_xref_dbi: DBConne ), f"{prefix}Expected 'WikiGene\t{wiki}' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_refseq_no_source_id(refseq_parser: RefSeqParser, test_no_source_id: Callable[[RefSeqParser, int], None]) -> None: - test_no_source_id(refseq_parser, SPECIES_ID_HUMAN) - -def test_refseq_no_species_id(refseq_parser: RefSeqParser, test_no_species_id: Callable[[RefSeqParser, int], None]) -> None: - test_no_species_id(refseq_parser, SOURCE_ID_REFSEQ_MRNA) - -def test_refseq_no_file(refseq_parser: RefSeqParser, test_no_file: Callable[[RefSeqParser, int, int], None]) -> None: - test_no_file(refseq_parser, SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) +def test_refseq_missing_argument(refseq_parser: RefSeqParser, test_parser_missing_argument: Callable[[RefSeqParser, str, int, int], None]) -> None: + test_parser_missing_argument(refseq_parser, "source_id", SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) + test_parser_missing_argument(refseq_parser, "species_id", SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) + 
test_parser_missing_argument(refseq_parser, "file", SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) # Test case to check if an error is raised when the required source_id is missing -def test_mim2gene_missing_required_source_id(refseq_parser: RefSeqParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[RefSeqParser, DBConnection, str, int, int, str], None]) -> None: +def test_refseq_missing_required_source_id(refseq_parser: RefSeqParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[RefSeqParser, DBConnection, str, int, int, str], None]) -> None: test_missing_required_source_id(refseq_parser, mock_xref_dbi, 'RefSeq_peptide', SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) # Test case to check if parsing is skipped when no species name can be found diff --git a/src/python/test/xrefs/parsers/test_rfam_parser.py b/src/python/test/xrefs/parsers/test_rfam_parser.py index 86caa9669..5d9c780ba 100644 --- a/src/python/test/xrefs/parsers/test_rfam_parser.py +++ b/src/python/test/xrefs/parsers/test_rfam_parser.py @@ -37,14 +37,10 @@ def run_and_validate_parsing(rfam_parser: RFAMParser, mock_xref_dbi: DBConnectio ), f"{prefix}Expected 'Added {expected_xrefs} RFAM xrefs and {expected_direct_xrefs} direct xrefs' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_rfam_no_source_id(rfam_parser: RFAMParser, test_no_source_id: Callable[[RFAMParser, int], None]) -> None: - test_no_source_id(rfam_parser, SPECIES_ID_HUMAN) - -def test_rfam_no_species_id(rfam_parser: RFAMParser, test_no_species_id: Callable[[RFAMParser, int], None]) -> None: - test_no_species_id(rfam_parser, SOURCE_ID_RFAM) - -def test_rfam_no_file(rfam_parser: RFAMParser, test_no_file: Callable[[RFAMParser, int, int], None]) -> None: - test_no_file(rfam_parser, SOURCE_ID_RFAM, SPECIES_ID_HUMAN) +def test_rfam_missing_argument(rfam_parser: RFAMParser, test_parser_missing_argument: Callable[[RFAMParser, str, int, int], None]) -> None: + test_parser_missing_argument(rfam_parser, "source_id", SOURCE_ID_RFAM, SPECIES_ID_HUMAN) + test_parser_missing_argument(rfam_parser, "species_id", SOURCE_ID_RFAM, SPECIES_ID_HUMAN) + test_parser_missing_argument(rfam_parser, "file", SOURCE_ID_RFAM, SPECIES_ID_HUMAN) # Test case to check if parsing is skipped when no species name can be found def test_no_species_name(mock_xref_dbi: DBConnection, rfam_parser: RFAMParser) -> None: diff --git a/src/python/test/xrefs/parsers/test_rgd_parser.py b/src/python/test/xrefs/parsers/test_rgd_parser.py index 2b8019c3f..d057b02f4 100644 --- a/src/python/test/xrefs/parsers/test_rgd_parser.py +++ b/src/python/test/xrefs/parsers/test_rgd_parser.py @@ -45,14 +45,10 @@ def run_and_validate_parsing(rgd_parser: RGDParser, mock_xref_dbi: DBConnection, ), f"{prefix}Expected 'Added {expected_synonyms} synonyms, including duplicates' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_rgd_no_source_id(rgd_parser: RGDParser, test_no_source_id: Callable[[RGDParser, int], None]) -> None: - test_no_source_id(rgd_parser, SPECIES_ID_RAT) - -def test_rgd_no_species_id(rgd_parser: RGDParser, test_no_species_id: Callable[[RGDParser, int], None]) -> None: - test_no_species_id(rgd_parser, SOURCE_ID_RGD) - -def test_rgd_no_file(rgd_parser: RGDParser, test_no_file: Callable[[RGDParser, int, int], None]) -> None: - test_no_file(rgd_parser, SOURCE_ID_RGD, SPECIES_ID_RAT) +def 
test_rgd_missing_argument(rgd_parser: RGDParser, test_parser_missing_argument: Callable[[RGDParser, str, int, int], None]) -> None: + test_parser_missing_argument(rgd_parser, "source_id", SOURCE_ID_RGD, SPECIES_ID_RAT) + test_parser_missing_argument(rgd_parser, "species_id", SOURCE_ID_RGD, SPECIES_ID_RAT) + test_parser_missing_argument(rgd_parser, "file", SOURCE_ID_RGD, SPECIES_ID_RAT) # Test case to check if an error is raised when the file is not found def test_rgd_file_not_found(rgd_parser: RGDParser, test_file_not_found: Callable[[RGDParser, int, int], None]) -> None: @@ -89,7 +85,7 @@ def test_successful_parsing_without_refseqs(mock_xref_dbi: DBConnection, rgd_par # Test case to check successful parsing of valid RGD data with refseqs def test_successful_parsing_with_refseqs(mock_xref_dbi: DBConnection, rgd_parser: RGDParser) -> None: rgd_parser.get_source_id_for_source_name = MagicMock(return_value=SOURCE_ID_DIRECT) - rgd_parser.get_valid_codes = MagicMock(return_value={"NM_052979": [12, 34], "XM_039101774" : [56], "XM_063281326": [78]}) + rgd_parser.get_acc_to_xref_ids = MagicMock(return_value={"NM_052979": [12, 34], "XM_039101774" : [56], "XM_063281326": [78]}) # Run and validate parsing for RGD file with existing refseqs run_and_validate_parsing(rgd_parser, mock_xref_dbi, 3, 5, 1, 12) diff --git a/src/python/test/xrefs/parsers/test_ucsc_parser.py b/src/python/test/xrefs/parsers/test_ucsc_parser.py index ae96e4d3f..13e1dc071 100644 --- a/src/python/test/xrefs/parsers/test_ucsc_parser.py +++ b/src/python/test/xrefs/parsers/test_ucsc_parser.py @@ -36,14 +36,10 @@ def run_and_validate_parsing(ucsc_parser: UCSCParser, mock_xref_dbi: DBConnectio ), f"{prefix}Expected 'Loaded a total of {expected_xrefs} UCSC xrefs' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_ucsc_no_source_id(ucsc_parser: UCSCParser, test_no_source_id: Callable[[UCSCParser, int], None]) -> None: - test_no_source_id(ucsc_parser, SPECIES_ID_HUMAN) - -def test_ucsc_no_species_id(ucsc_parser: UCSCParser, test_no_species_id: Callable[[UCSCParser, int], None]) -> None: - test_no_species_id(ucsc_parser, SOURCE_ID_UCSC) - -def test_ucsc_no_file(ucsc_parser: UCSCParser, test_no_file: Callable[[UCSCParser, int, int], None]) -> None: - test_no_file(ucsc_parser, SOURCE_ID_UCSC, SPECIES_ID_HUMAN) +def test_ucsc_missing_argument(ucsc_parser: UCSCParser, test_parser_missing_argument: Callable[[UCSCParser, str, int, int], None]) -> None: + test_parser_missing_argument(ucsc_parser, "source_id", SOURCE_ID_UCSC, SPECIES_ID_HUMAN) + test_parser_missing_argument(ucsc_parser, "species_id", SOURCE_ID_UCSC, SPECIES_ID_HUMAN) + test_parser_missing_argument(ucsc_parser, "file", SOURCE_ID_UCSC, SPECIES_ID_HUMAN) # Test case to check if an error is raised when the file is not found def test_ucsc_file_not_found(ucsc_parser: UCSCParser, test_file_not_found: Callable[[UCSCParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_uniprot_parser.py b/src/python/test/xrefs/parsers/test_uniprot_parser.py index 0cf0e2cc7..c80337717 100644 --- a/src/python/test/xrefs/parsers/test_uniprot_parser.py +++ b/src/python/test/xrefs/parsers/test_uniprot_parser.py @@ -63,6 +63,8 @@ def populate_xref_db(mock_xref_dbi: DBConnection): } ) + mock_xref_dbi.commit() + # Function to run and validate the parsing process def run_and_validate_parsing(uniprot_parser: UniProtParser, mock_xref_dbi: DBConnection, file:str, expected_xrefs: Dict[str, 
int], expected_deps: Dict[str, int], prefix: str = None) -> None: if prefix is None: @@ -107,14 +109,10 @@ def run_and_validate_parsing(uniprot_parser: UniProtParser, mock_xref_dbi: DBCon assert f"{count_type}\t{count}" in result_message, f"{prefix}Expected '{count_type}\t{count}' in result_meesgae, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_uniprot_no_source_id(uniprot_parser: UniProtParser, test_no_source_id: Callable[[UniProtParser, int], None]) -> None: - test_no_source_id(uniprot_parser, SPECIES_ID_HUMAN) - -def test_uniprot_no_species_id(uniprot_parser: UniProtParser, test_no_species_id: Callable[[UniProtParser, int], None]) -> None: - test_no_species_id(uniprot_parser, SOURCE_ID_UNIPROT) - -def test_uniprot_no_file(uniprot_parser: UniProtParser, test_no_file: Callable[[UniProtParser, int, int], None]) -> None: - test_no_file(uniprot_parser, SOURCE_ID_UNIPROT, SPECIES_ID_HUMAN) +def test_uniprot_missing_argument(uniprot_parser: UniProtParser, test_parser_missing_argument: Callable[[UniProtParser, str, int, int], None]) -> None: + test_parser_missing_argument(uniprot_parser, "source_id", SOURCE_ID_UNIPROT, SPECIES_ID_HUMAN) + test_parser_missing_argument(uniprot_parser, "species_id", SOURCE_ID_UNIPROT, SPECIES_ID_HUMAN) + test_parser_missing_argument(uniprot_parser, "file", SOURCE_ID_UNIPROT, SPECIES_ID_HUMAN) # Test case to check if an error is raised when the required source_id is missing def test_uniprot_missing_required_source_id(uniprot_parser: UniProtParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[UniProtParser, DBConnection, str, int, int, str], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_vgnc_parser.py b/src/python/test/xrefs/parsers/test_vgnc_parser.py index 6ebe58d8d..5fb2297bc 100644 --- a/src/python/test/xrefs/parsers/test_vgnc_parser.py +++ b/src/python/test/xrefs/parsers/test_vgnc_parser.py @@ -36,14 +36,10 @@ def run_and_validate_parsing(vgnc_parser: VGNCParser, mock_xref_dbi: DBConnectio ), f"{prefix}Expected 'Loaded a total of {expected_xrefs} VGNC xrefs and added {expected_synonyms} synonyms' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_vgnc_no_source_id(vgnc_parser: VGNCParser, test_no_source_id: Callable[[VGNCParser, int], None]) -> None: - test_no_source_id(vgnc_parser, SPECIES_ID_P_TROGLODYTES) - -def test_vgnc_no_species_id(vgnc_parser: VGNCParser, test_no_species_id: Callable[[VGNCParser, int], None]) -> None: - test_no_species_id(vgnc_parser, SOURCE_ID_VGNC) - -def test_vgnc_no_file(vgnc_parser: VGNCParser, test_no_file: Callable[[VGNCParser, int, int], None]) -> None: - test_no_file(vgnc_parser, SOURCE_ID_VGNC, SPECIES_ID_P_TROGLODYTES) +def test_vgnc_missing_argument(vgnc_parser: VGNCParser, test_parser_missing_argument: Callable[[VGNCParser, str, int, int], None]) -> None: + test_parser_missing_argument(vgnc_parser, "source_id", SOURCE_ID_VGNC, SPECIES_ID_P_TROGLODYTES) + test_parser_missing_argument(vgnc_parser, "species_id", SOURCE_ID_VGNC, SPECIES_ID_P_TROGLODYTES) + test_parser_missing_argument(vgnc_parser, "file", SOURCE_ID_VGNC, SPECIES_ID_P_TROGLODYTES) # Test case to check if an error is raised when the file is not found def test_vgnc_file_not_found(vgnc_parser: VGNCParser, test_file_not_found: Callable[[VGNCParser, int, int], None]) -> None: diff --git 
a/src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py b/src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py index dda0c7bdc..1a8d36d90 100644 --- a/src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py +++ b/src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py @@ -34,14 +34,10 @@ def run_and_validate_parsing(xenopus_jamboree_parser: XenopusJamboreeParser, moc ), f"{prefix}Expected '{expected_xrefs} XenopusJamboree xrefs successfully parsed' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_xenopus_jamboree_no_source_id(xenopus_jamboree_parser: XenopusJamboreeParser, test_no_source_id: Callable[[XenopusJamboreeParser, int], None]) -> None: - test_no_source_id(xenopus_jamboree_parser, SPECIES_ID_XENOPUS) - -def test_xenopus_jamboree_no_species_id(xenopus_jamboree_parser: XenopusJamboreeParser, test_no_species_id: Callable[[XenopusJamboreeParser, int], None]) -> None: - test_no_species_id(xenopus_jamboree_parser, SOURCE_ID_XENOPUS_JAMBOREE) - -def test_xenopus_jamboree_no_file(xenopus_jamboree_parser: XenopusJamboreeParser, test_no_file: Callable[[XenopusJamboreeParser, int, int], None]) -> None: - test_no_file(xenopus_jamboree_parser, SOURCE_ID_XENOPUS_JAMBOREE, SPECIES_ID_XENOPUS) +def test_xenopus_jamboree_missing_argument(xenopus_jamboree_parser: XenopusJamboreeParser, test_parser_missing_argument: Callable[[XenopusJamboreeParser, str, int, int], None]) -> None: + test_parser_missing_argument(xenopus_jamboree_parser, "source_id", SOURCE_ID_XENOPUS_JAMBOREE, SPECIES_ID_XENOPUS) + test_parser_missing_argument(xenopus_jamboree_parser, "species_id", SOURCE_ID_XENOPUS_JAMBOREE, SPECIES_ID_XENOPUS) + test_parser_missing_argument(xenopus_jamboree_parser, "file", SOURCE_ID_XENOPUS_JAMBOREE, SPECIES_ID_XENOPUS) # Test case to check if an error is raised when the file is not found def test_xenopus_jamboree_file_not_found(xenopus_jamboree_parser: XenopusJamboreeParser, test_file_not_found: Callable[[XenopusJamboreeParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_zfin_desc_parser.py b/src/python/test/xrefs/parsers/test_zfin_desc_parser.py index 1ef373c46..ac3e52eed 100644 --- a/src/python/test/xrefs/parsers/test_zfin_desc_parser.py +++ b/src/python/test/xrefs/parsers/test_zfin_desc_parser.py @@ -6,7 +6,7 @@ from test_helpers import check_row_count # Constants -SOURCE_ID_ZFIN = 1 +SOURCE_ID_ZFIN_DESC = 1 SPECIES_ID_ZEBRAFISH = 7955 # Fixture to create a ZFINDescParser instance @@ -21,7 +21,7 @@ def run_and_validate_parsing(zfin_desc_parser: ZFINDescParser, mock_xref_dbi: DB result_code, result_message = zfin_desc_parser.run( { - "source_id": SOURCE_ID_ZFIN, + "source_id": SOURCE_ID_ZFIN_DESC, "species_id": SPECIES_ID_ZEBRAFISH, "file": "parsers/flatfiles/zfin_desc.txt", "xref_dbi": mock_xref_dbi, @@ -37,22 +37,18 @@ def run_and_validate_parsing(zfin_desc_parser: ZFINDescParser, mock_xref_dbi: DB ), f"{prefix}Expected '{expected_withdrawn} withdrawn entries ignored' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_zfin_desc_no_source_id(zfin_desc_parser: ZFINDescParser, test_no_source_id: Callable[[ZFINDescParser, int], None]) -> None: - test_no_source_id(zfin_desc_parser, SPECIES_ID_ZEBRAFISH) - -def test_zfin_desc_no_species_id(zfin_desc_parser: ZFINDescParser, test_no_species_id: Callable[[ZFINDescParser, int], None]) -> 
None: - test_no_species_id(zfin_desc_parser, SOURCE_ID_ZFIN) - -def test_zfin_desc_no_file(zfin_desc_parser: ZFINDescParser, test_no_file: Callable[[ZFINDescParser, int, int], None]) -> None: - test_no_file(zfin_desc_parser, SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) +def test_zfin_desc_missing_argument(zfin_desc_parser: ZFINDescParser, test_parser_missing_argument: Callable[[ZFINDescParser, str, int, int], None]) -> None: + test_parser_missing_argument(zfin_desc_parser, "source_id", SOURCE_ID_ZFIN_DESC, SPECIES_ID_ZEBRAFISH) + test_parser_missing_argument(zfin_desc_parser, "species_id", SOURCE_ID_ZFIN_DESC, SPECIES_ID_ZEBRAFISH) + test_parser_missing_argument(zfin_desc_parser, "file", SOURCE_ID_ZFIN_DESC, SPECIES_ID_ZEBRAFISH) # Test case to check if an error is raised when the file is not found def test_zfin_desc_file_not_found(zfin_desc_parser: ZFINDescParser, test_file_not_found: Callable[[ZFINDescParser, int, int], None]) -> None: - test_file_not_found(zfin_desc_parser, SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + test_file_not_found(zfin_desc_parser, SOURCE_ID_ZFIN_DESC, SPECIES_ID_ZEBRAFISH) # Test case to check if an error is raised when the file is empty def test_zfin_desc_empty_file(zfin_desc_parser: ZFINDescParser, test_empty_file: Callable[[ZFINDescParser, str, int, int], None]) -> None: - test_empty_file(zfin_desc_parser, 'ZFINDesc', SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + test_empty_file(zfin_desc_parser, 'ZFINDesc', SOURCE_ID_ZFIN_DESC, SPECIES_ID_ZEBRAFISH) # Test case to check successful parsing of valid ZFINDesc data def test_successful_parsing(mock_xref_dbi: DBConnection, zfin_desc_parser: ZFINDescParser) -> None: @@ -60,4 +56,4 @@ def test_successful_parsing(mock_xref_dbi: DBConnection, zfin_desc_parser: ZFIND run_and_validate_parsing(zfin_desc_parser, mock_xref_dbi, 6, 3) # Check the row counts in the xref table - check_row_count(mock_xref_dbi, "xref", 6, f"info_type='MISC' AND source_id={SOURCE_ID_ZFIN}") \ No newline at end of file + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='MISC' AND source_id={SOURCE_ID_ZFIN_DESC}") \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_zfin_parser.py b/src/python/test/xrefs/parsers/test_zfin_parser.py index 060ffa2bc..4972fb1b8 100644 --- a/src/python/test/xrefs/parsers/test_zfin_parser.py +++ b/src/python/test/xrefs/parsers/test_zfin_parser.py @@ -108,14 +108,10 @@ def run_and_validate_parsing(zfin_parser: ZFINParser, mock_xref_dbi: DBConnectio ), f"{prefix}Expected '{expected_synonyms} synonyms loaded' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_zfin_no_source_id(zfin_parser: ZFINParser, test_no_source_id: Callable[[ZFINParser, int], None]) -> None: - test_no_source_id(zfin_parser, SPECIES_ID_ZEBRAFISH) - -def test_zfin_no_species_id(zfin_parser: ZFINParser, test_no_species_id: Callable[[ZFINParser, int], None]) -> None: - test_no_species_id(zfin_parser, SOURCE_ID_ZFIN) - -def test_zfin_no_file(zfin_parser: ZFINParser, test_no_file: Callable[[ZFINParser, int, int], None]) -> None: - test_no_file(zfin_parser, SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) +def test_zfin_missing_argument(zfin_parser: ZFINParser, test_parser_missing_argument: Callable[[ZFINParser, str, int, int], None]) -> None: + test_parser_missing_argument(zfin_parser, "source_id", SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + test_parser_missing_argument(zfin_parser, "species_id", SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + 
test_parser_missing_argument(zfin_parser, "file", SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) # Test case to check if an error is raised when the required source_id is missing def test_zfin_missing_required_source_id(zfin_parser: ZFINParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[ZFINParser, DBConnection, str, int, int, str], None]) -> None: diff --git a/src/python/test/xrefs/pytest.ini b/src/python/test/xrefs/pytest.ini index b79469489..dbdc951d2 100644 --- a/src/python/test/xrefs/pytest.ini +++ b/src/python/test/xrefs/pytest.ini @@ -1,2 +1,5 @@ [pytest] +env = + TEST_DB_URL = mysql://USER:PASS@HOST:PORT + TEST_SCRATCH_PATH = /homes/USER/tmp addopts = --verbose --tb=line \ No newline at end of file diff --git a/src/python/test/xrefs/test_checksum.py b/src/python/test/xrefs/test_checksum.py new file mode 100644 index 000000000..4d86ad0c7 --- /dev/null +++ b/src/python/test/xrefs/test_checksum.py @@ -0,0 +1,104 @@ +import pytest +import os +import shutil +import datetime +from typing import Any, Dict, Callable, Optional +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count + +from ensembl.production.xrefs.Checksum import Checksum + +DEFAULT_ARGS = { + "base_path": "dummy_base_path", + "source_db_url": "mysql://user:pass@host/db", + "skip_download": False, +} + +# Fixture to create a Checksum instance +@pytest.fixture +def checksum() -> Callable[[Optional[Dict[str, Any]]], Checksum]: + def _create_checksum(args: Optional[Dict[str, Any]] = None) -> Checksum: + # Use provided args or default to default_args + args = args or DEFAULT_ARGS + + return Checksum(args, True, True) + return _create_checksum + +# Test case to check if an error is raised when a mandatory parameter is missing +def test_checksum_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): + test_missing_required_param("Checksum", DEFAULT_ARGS, "base_path") + test_missing_required_param("Checksum", DEFAULT_ARGS, "source_db_url") + test_missing_required_param("Checksum", DEFAULT_ARGS, "skip_download") + +# Test case to check successful run +def test_successful_run(mock_source_dbi: DBConnection, checksum: Checksum, pytestconfig: pytest.Config): + # Setup for test parameters and create a Checksum instance + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = { + "base_path": test_scratch_path, + "source_db_url": mock_source_dbi.engine.url, + "skip_download": False, + } + checksum_instance = checksum(args) + + checksum_path = os.path.join(test_scratch_path, "Checksum") + checksum_file = os.path.join(checksum_path, "checksum.txt") + try: + # Run the Checksum instance without checksum source files + checksum_instance.run() + + # Check that the Checksum folder was created + assert os.path.exists(test_scratch_path), "Checksum folder was not created" + + # Check that no checksum.txt file was created + assert not os.path.exists(checksum_file), "File checksum.txt was created" + + # Copy some checksum files into the Checksum folder + shutil.copy("flatfiles/RNACentral-md5.tsv.gz", checksum_path) + shutil.copy("flatfiles/UniParc-upidump.lis", checksum_path) + + # Run the Checksum instance again + checksum_instance.run() + + # Check that the checksum.txt file was created and is not empty + assert os.path.exists(checksum_file), "File checksum.txt was not created" + assert os.path.getsize(checksum_file) > 0, "File checksum.txt is empty" + + # Get the last modified time and size of the file + timestamp = 
os.path.getmtime(checksum_file) + last_modified = datetime.datetime.fromtimestamp(timestamp) + size = os.path.getsize(checksum_file) + + # Check that the checksum rows were added + check_row_count(mock_source_dbi, "checksum_xref", 30) + + # Run the Checksum instance again + checksum_instance.run() + + # Check that the checksum.txt file was created again + timestamp = os.path.getmtime(checksum_file) + new_last_modified = datetime.datetime.fromtimestamp(timestamp) + assert new_last_modified > last_modified, "File checksum.txt was not created again" + assert os.path.getsize(checksum_file) == size, "File checksum.txt does not have the same size" + last_modified = new_last_modified + + # Check that the checksum rows are still the same + check_row_count(mock_source_dbi, "checksum_xref", 30) + + # Set the skip_download parameter to True + checksum_instance.set_param("skip_download", True) + + # Run the Checksum instance again + checksum_instance.run() + + # Check that the checksum.txt file was not created again + timestamp = os.path.getmtime(checksum_file) + new_last_modified = datetime.datetime.fromtimestamp(timestamp) + assert new_last_modified == last_modified, "File checksum.txt was created again" + + # Check that the checksum rows are still the same + check_row_count(mock_source_dbi, "checksum_xref", 30) + finally: + # Cleanup: Remove the Checksum folder if it exists + if os.path.exists(checksum_path): + shutil.rmtree(checksum_path) \ No newline at end of file diff --git a/src/python/test/xrefs/test_download_source.py b/src/python/test/xrefs/test_download_source.py new file mode 100644 index 000000000..4e537ab5b --- /dev/null +++ b/src/python/test/xrefs/test_download_source.py @@ -0,0 +1,118 @@ +import pytest +import os +import shutil +import datetime +from typing import Any, Dict, Callable, Optional +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count + +from ensembl.production.xrefs.DownloadSource import DownloadSource + +DEFAULT_ARGS = { + "base_path": "dummy_base_path", + "parser": "dummy_parser", + "name": "dummy_name", + "priority": 1, + "source_db_url": "mysql://user:pass@host/db", + "file": "dummy_file", + "skip_download": False, +} + +# Fixture to create a DownloadSource instance +@pytest.fixture +def download_source() -> Callable[[Optional[Dict[str, Any]]], DownloadSource]: + def _create_download_source(args: Optional[Dict[str, Any]] = None) -> DownloadSource: + # Use provided args or default to default_args + args = args or DEFAULT_ARGS + + return DownloadSource(args, True, True) + return _create_download_source + +# Test case to check if an error is raised when a mandatory parameter is missing +def test_download_source_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): + test_missing_required_param("DownloadSource", DEFAULT_ARGS, "base_path") + test_missing_required_param("DownloadSource", DEFAULT_ARGS, "parser") + test_missing_required_param("DownloadSource", DEFAULT_ARGS, "name") + test_missing_required_param("DownloadSource", DEFAULT_ARGS, "priority") + test_missing_required_param("DownloadSource", DEFAULT_ARGS, "source_db_url") + test_missing_required_param("DownloadSource", DEFAULT_ARGS, "file") + test_missing_required_param("DownloadSource", DEFAULT_ARGS, "skip_download") + +# Test case to check if an error is raised when an invalid URL scheme is provided +def test_invalid_url_scheme(download_source: DownloadSource, pytestconfig): + # Setup for test parameters and create a DownloadSource 
instance + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = DEFAULT_ARGS.copy() + args["base_path"] = test_scratch_path + args["file"] = "wrong://dummy_file" + download_source_instance = download_source(args) + + try: + # Run the DownloadSource instance + with pytest.raises( + AttributeError, match="Invalid URL scheme wrong" + ): + download_source_instance.run() + finally: + # Cleanup: Remove the created path if it exists + dummy_source_path = os.path.join(test_scratch_path, "dummy_name") + if os.path.exists(dummy_source_path): + shutil.rmtree(dummy_source_path) + +# TO DO: Add test cases to check for ftp and copy cases + downloading version files + +# Test case to check successful run +def test_successful_run(mock_source_dbi: DBConnection, download_source: DownloadSource, pytestconfig: pytest.Config): + # Setup for test parameters and create a DownloadSource instance + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = { + "base_path": test_scratch_path, + "parser": "DBASSParser", + "name": "DBASS3", + "priority": 1, + "source_db_url": mock_source_dbi.engine.url, + "file": "https://www.dbass.soton.ac.uk/Dbass3/DownloadCsv", + "skip_download": False, + } + download_source_instance = download_source(args) + + try: + # Run the DownloadSource instance + download_source_instance.run() + + # Check if the file was downloaded + file_path = os.path.join(test_scratch_path, "DBASS3", "DownloadCsv") + assert os.path.exists(file_path), "DBASS3 file not downloaded into the correct path" + + # Check if the source was added to the source table + check_row_count(mock_source_dbi, "source", 1) + check_row_count(mock_source_dbi, "version", 1) + + # Get the last modified time of the file + timestamp = os.path.getmtime(file_path) + last_modified = datetime.datetime.fromtimestamp(timestamp) + + # Run the DownloadSource instance again + download_source_instance.run() + + # Check that the file was downloaded again + timestamp = os.path.getmtime(file_path) + new_last_modified = datetime.datetime.fromtimestamp(timestamp) + assert new_last_modified > last_modified, "DBASS3 file not downloaded again" + last_modified = new_last_modified + + # Set the skip_download parameter to True + download_source_instance.set_param("skip_download", True) + + # Run the DownloadSource instance again + download_source_instance.run() + + # Check that the file was not downloaded again + timestamp = os.path.getmtime(file_path) + new_last_modified = datetime.datetime.fromtimestamp(timestamp) + assert new_last_modified == last_modified, "DBASS3 file downloaded again" + finally: + # Cleanup: Remove the created file and path if it exists + source_path = os.path.join(test_scratch_path, "DBASS3") + if os.path.exists(source_path): + shutil.rmtree(source_path) \ No newline at end of file diff --git a/src/python/test/xrefs/test_helpers.py b/src/python/test/xrefs/test_helpers.py index efe1e35d4..6cbac3910 100644 --- a/src/python/test/xrefs/test_helpers.py +++ b/src/python/test/xrefs/test_helpers.py @@ -1,6 +1,8 @@ +import json from sqlalchemy import text +from typing import List, Dict, Any -from ensembl.utils.database import UnitTestDB, DBConnection +from ensembl.utils.database import DBConnection # Helper function to check the row count in a specific table def check_row_count(db: DBConnection, table: str, expected_count: int, where_clause: str = None) -> None: @@ -77,4 +79,23 @@ def check_release(db: DBConnection, source_id: str, expected_release: str) -> No ).scalar() assert ( release == 
expected_release - ), f"Expected release info '{expected_release}' for source_id {source_id}, but got '{release}'" \ No newline at end of file + ), f"Expected release info '{expected_release}' for source_id {source_id}, but got '{release}'" + +# Helper function to check the dataflow content of a dataflow file +def check_dataflow_content(dataflow_file_path: str, expected_content: List[Dict[str, Any]]) -> None: + # Get the content of the dataflow file + actual_content = [] + with open(dataflow_file_path) as fh: + for line in fh: + actual_content.append(json.loads(line.strip())) + + # Sort both the expected and actual content lists + actual_content_sorted = sorted(actual_content, key=lambda x: json.dumps(x, sort_keys=True)) + expected_content_sorted = sorted(expected_content, key=lambda x: json.dumps(x, sort_keys=True)) + + # Compare the expected and actual content + assert actual_content_sorted == expected_content_sorted, ( + f"Dataflow file content does not match expected content.\n" + f"Expected (sorted): {expected_content_sorted}\n" + f"Actual (sorted): {actual_content_sorted}" + ) \ No newline at end of file diff --git a/src/python/test/xrefs/test_schedule_alignment.py b/src/python/test/xrefs/test_schedule_alignment.py new file mode 100644 index 000000000..2254a58e2 --- /dev/null +++ b/src/python/test/xrefs/test_schedule_alignment.py @@ -0,0 +1,106 @@ +import pytest +import os +import shutil +from typing import Any, Dict, Callable, Optional +from test_helpers import check_dataflow_content + +from ensembl.production.xrefs.ScheduleAlignment import ScheduleAlignment + +DEFAULT_ARGS = { + "species_name": "homo_sapiens", + "release": 999, + "ensembl_fasta": "dummy_ensembl_fasta.fa", + "xref_fasta": "dummy_xref_fasta.fa", + "seq_type": "peptide", + "xref_db_url": "mysql://user:pass@host/xref_db", + "base_path": "dummy_base_path", + "method": "--bestn 1", + "query_cutoff": 100, + "target_cutoff": 100, + "source_id": 1, + "source_name": "RefSeq_peptide", + "job_index": 1, + "chunk_size": 4000 +} + +# Fixture to create a ScheduleAlignment instance +@pytest.fixture +def schedule_alignment() -> Callable[[Optional[Dict[str, Any]]], ScheduleAlignment]: + def _create_schedule_alignment(args: Optional[Dict[str, Any]] = None) -> ScheduleAlignment: + # Use provided args or default to default_args + args = args or DEFAULT_ARGS + + return ScheduleAlignment(args, True, True) + return _create_schedule_alignment + +# Test case to check if an error is raised when a mandatory parameter is missing +def test_schedule_alignment_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "species_name") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "release") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "ensembl_fasta") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "xref_fasta") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "seq_type") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "xref_db_url") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "base_path") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "method") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "query_cutoff") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "target_cutoff") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "source_id") + 
test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "source_name") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "job_index") + +# Test case to check successful run +def test_successful_run(schedule_alignment: ScheduleAlignment, pytestconfig: pytest.Config): + # Setup for test parameters and create a ScheduleAlignment instance + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = DEFAULT_ARGS.copy() + args["base_path"] = test_scratch_path + args["dataflow_output_path"] = test_scratch_path + schedule_alignment_instance = schedule_alignment(args) + + dataflow_file_path = os.path.join(test_scratch_path, "dataflow_alignment.json") + try: + # Create the appropriate paths and copy a fasta file + ensembl_path = schedule_alignment_instance.get_path(test_scratch_path, "homo_sapiens", 999, "ensembl") + shutil.copy("flatfiles/peptides.fa", ensembl_path) + ensembl_file_path = os.path.join(ensembl_path, "peptides.fa") + schedule_alignment_instance.set_param("ensembl_fasta", ensembl_file_path) + + # Run the ScheduleAlignment instance + schedule_alignment_instance.run() + + # Check that an alignment path was created + alignment_path = os.path.join(test_scratch_path, "homo_sapiens", "999", "alignment") + assert os.path.exists(alignment_path), f"Expected path {alignment_path} not created" + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file + expected_content = [ + { + "species_name": "homo_sapiens", "align_method": "--bestn 1", "query_cutoff": 100, "target_cutoff": 100, "max_chunks": 3, "chunk": 1, + "job_index": 1, "source_file": "dummy_xref_fasta.fa", "target_file": ensembl_file_path, "xref_db_url": "mysql://user:pass@host/xref_db", + "map_file": os.path.join(alignment_path, "peptide_alignment_1_1_of_3.map"), "source_id": 1, "source_name": "RefSeq_peptide", "seq_type": "peptide" + }, + { + "species_name": "homo_sapiens", "align_method": "--bestn 1", "query_cutoff": 100, "target_cutoff": 100, "max_chunks": 3, "chunk": 2, + "job_index": 1, "source_file": "dummy_xref_fasta.fa", "target_file": ensembl_file_path, "xref_db_url": "mysql://user:pass@host/xref_db", + "map_file": os.path.join(alignment_path, "peptide_alignment_1_2_of_3.map"), "source_id": 1, "source_name": "RefSeq_peptide", "seq_type": "peptide" + }, + { + "species_name": "homo_sapiens", "align_method": "--bestn 1", "query_cutoff": 100, "target_cutoff": 100, "max_chunks": 3, "chunk": 3, + "job_index": 1, "source_file": "dummy_xref_fasta.fa", "target_file": ensembl_file_path, "xref_db_url": "mysql://user:pass@host/xref_db", + "map_file": os.path.join(alignment_path, "peptide_alignment_1_3_of_3.map"), "source_id": 1, "source_name": "RefSeq_peptide", "seq_type": "peptide" + } + ] + check_dataflow_content(dataflow_file_path, expected_content) + finally: + # Cleanup: Remove the dataflow file if it exists + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) + + # Cleanup: Remove the homo_sapiens folder if it exists + ensembl_path = os.path.join(test_scratch_path, "homo_sapiens") + if os.path.exists(ensembl_path): + shutil.rmtree(ensembl_path) \ No newline at end of file diff --git a/src/python/test/xrefs/test_schedule_cleanup.py b/src/python/test/xrefs/test_schedule_cleanup.py new file mode 100644 index 000000000..6d8c8721d --- /dev/null +++ b/src/python/test/xrefs/test_schedule_cleanup.py @@ -0,0 +1,116 @@ +import pytest +import os +import shutil +from 
sqlalchemy import text
+from typing import Any, Dict, Callable, Optional
+from ensembl.utils.database import DBConnection
+from test_helpers import check_dataflow_content
+
+from ensembl.production.xrefs.ScheduleCleanup import ScheduleCleanup
+
+DEFAULT_ARGS = {
+ "base_path": "dummy_base_path",
+ "source_db_url": "mysql://user:pass@host/db",
+}
+
+# Fixture to create a ScheduleCleanup instance
+@pytest.fixture
+def schedule_cleanup() -> Callable[[Optional[Dict[str, Any]]], ScheduleCleanup]:
+ def _create_schedule_cleanup(args: Optional[Dict[str, Any]] = None) -> ScheduleCleanup:
+ # Use provided args or default to default_args
+ args = args or DEFAULT_ARGS
+
+ return ScheduleCleanup(args, True, True)
+ return _create_schedule_cleanup
+
+# Function to populate the database with sources
+def populate_source_db(mock_source_dbi: DBConnection):
+ source_data = [
+ [1, 'DBASS3', 'DBASSParser'],
+ [2, 'RefSeq_dna', 'RefSeqParser'],
+ [3, 'Uniprot/SWISSPROT', 'UniProtParser'],
+ [4, 'VGNC', 'VGNCParser'],
+ ]
+ for row in source_data:
+ mock_source_dbi.execute(
+ text("INSERT INTO source (source_id, name, parser) VALUES (:source_id, :name, :parser)"),
+ {"source_id": row[0], "name": row[1], "parser": row[2],}
+ )
+
+ version_data = [
+ [1, 1, ''],
+ [2, 2, 'dummy_base_path/RefSeq_dna/RefSeq-release200.txt'],
+ [3, 1, 'dummy_base_path/UniprotSWISSPROT/reldate.txt'],
+ [4, 1, ''],
+ ]
+ for row in version_data:
+ mock_source_dbi.execute(
+ text("INSERT INTO version (source_id, priority, revision) VALUES (:source_id, :priority, :revision)"),
+ {"source_id": row[0], "priority": row[1], "revision": row[2],}
+ )
+
+ mock_source_dbi.commit()
+
+# Test case to check if an error is raised when a mandatory parameter is missing
+def test_schedule_cleanup_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]):
+ test_missing_required_param("ScheduleCleanup", DEFAULT_ARGS, "base_path")
+ test_missing_required_param("ScheduleCleanup", DEFAULT_ARGS, "source_db_url")
+
+# Test case to check successful run
+def test_successful_run(mock_source_dbi: DBConnection, schedule_cleanup: ScheduleCleanup, pytestconfig: pytest.Config):
+ # Setup for test parameters and create a ScheduleCleanup instance
+ test_scratch_path = pytestconfig.getoption("test_scratch_path")
+ args = {
+ "base_path": test_scratch_path,
+ "source_db_url": mock_source_dbi.engine.url,
+ "dataflow_output_path": test_scratch_path
+ }
+ schedule_cleanup_instance = schedule_cleanup(args)
+
+ dataflow_file_path = os.path.join(test_scratch_path, "dataflow_cleanup_sources.json")
+ try:
+ # Run the ScheduleCleanup instance without any sources to clean up
+ schedule_cleanup_instance.run()
+
+ # Check that the dataflow file is created
+ assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found"
+
+ # Check that the dataflow file is empty then remove it
+ assert os.path.getsize(dataflow_file_path) == 0, f"Expected file {dataflow_file_path} to be empty"
+ os.remove(dataflow_file_path)
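+
+ # Note: the staged runs in this test suggest that ScheduleCleanup only dataflows a source
+ # for cleanup once a download folder for it exists under base_path, where the folder name
+ # appears to be the source name with any '/' removed (e.g. 'Uniprot/SWISSPROT' becomes
+ # 'UniprotSWISSPROT'); DBASS3 and VGNC, which have no recorded version file and no folder
+ # created in this test, are never expected in dataflow_cleanup_sources.json.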
+ + # Create source folders for cleanup + os.makedirs(f"{test_scratch_path}/RefSeq_dna") + os.makedirs(f"{test_scratch_path}/UniprotSWISSPROT") + + # Run the ScheduleCleanup instance again + schedule_cleanup_instance.run() + + # Check the content of the dataflow file + expected_content = [ + {"name": "RefSeq_dna", "version_file": "dummy_base_path/RefSeq_dna/RefSeq-release200.txt"}, + {"name": "Uniprot/SWISSPROT", "version_file": "dummy_base_path/UniprotSWISSPROT/reldate.txt"} + ] + check_dataflow_content(dataflow_file_path, expected_content) + finally: + # Cleanup: Remove the dataflow file if it exists + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) + + # Cleanup: Remove the created paths if they exist + for source_path in [os.path.join(test_scratch_path, "RefSeq_dna"), os.path.join(test_scratch_path, "UniprotSWISSPROT")]: + if os.path.exists(source_path): + shutil.rmtree(source_path) diff --git a/src/python/test/xrefs/test_schedule_download.py b/src/python/test/xrefs/test_schedule_download.py new file mode 100644 index 000000000..8c17eb123 --- /dev/null +++ b/src/python/test/xrefs/test_schedule_download.py @@ -0,0 +1,116 @@ +import pytest +import io +import json +import os +from datetime import datetime +from unittest.mock import MagicMock, patch +from typing import Any, Dict, Callable, Optional +from sqlalchemy import create_engine, text +from sqlalchemy.engine.url import make_url +from test_helpers import check_dataflow_content + +from ensembl.production.xrefs.ScheduleDownload import ScheduleDownload + +DEFAULT_ARGS = { + "config_file": "dummy_config.json", + "source_db_url": "mysql://user:pass@host/db", + "reuse_db": False, +} + +# Fixture to create a ScheduleDownload instance +@pytest.fixture +def schedule_download() -> Callable[[Optional[Dict[str, Any]]], ScheduleDownload]: + def _create_schedule_download(args: Optional[Dict[str, Any]] = None) -> ScheduleDownload: + # Use provided args or default to default_args + args = args or DEFAULT_ARGS + + return ScheduleDownload(args, True, True) + return _create_schedule_download + +# Test case to check if an error is raised when a mandatory parameter is missing +def test_schedule_download_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): + test_missing_required_param("ScheduleDownload", DEFAULT_ARGS, "config_file") + test_missing_required_param("ScheduleDownload", DEFAULT_ARGS, "source_db_url") + test_missing_required_param("ScheduleDownload", DEFAULT_ARGS, "reuse_db") + +# Test case to check if an error is raised when the config file has an invalid json format +def test_invalid_config_file(schedule_download: ScheduleDownload): + # Create a ScheduleDownload instance + schedule_download_instance = schedule_download() + + # Create an invalid json file + mock_file = io.StringIO('[{"name": "source1", "parser": "parser1", "priority": 1, "file": "file1",}]') + with patch("ensembl.production.xrefs.ScheduleDownload.open", return_value=mock_file, create=True): + # Mock the create_source_db method + schedule_download_instance.create_source_db = MagicMock() + + # Run the ScheduleDownload instance + with pytest.raises(json.decoder.JSONDecodeError): + schedule_download_instance.run() + +# Test case to check if an error is raised when the config file is empty +def test_empty_config_file(schedule_download: ScheduleDownload): + # Create a ScheduleDownload instance + schedule_download_instance = schedule_download() + + # Create an empty json file + mock_file = io.StringIO('[]') + with 
patch("ensembl.production.xrefs.ScheduleDownload.open", return_value=mock_file, create=True): + # Mock the create_source_db method + schedule_download_instance.create_source_db = MagicMock() + + # Run the ScheduleDownload instance + with pytest.raises( + ValueError, match="No sources found in config file dummy_config.json. Need sources to run pipeline" + ): + schedule_download_instance.run() + +# TO DO: Add test case for reuse_db set to True + +# Test case to check successful run +def test_successful_run(schedule_download: ScheduleDownload, pytestconfig): + # Setup for test parameters and create a ScheduleDownload instance + test_scratch_path = pytestconfig.getoption("test_scratch_path") + test_mysql_url = pytestconfig.getoption("test_db_url") + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + user_name = os.getenv("USER", "default_user") + test_db_name = f"{user_name}_test_xref_source_db_{timestamp}" + args = { + "config_file": "flatfiles/sources.json", + "source_db_url": f"{test_mysql_url}/{test_db_name}", + "reuse_db": False, + "dataflow_output_path": test_scratch_path + } + schedule_download_instance = schedule_download(args) + + # Create a db engine for connection + test_engine = create_engine(make_url(test_mysql_url), isolation_level="AUTOCOMMIT") + + dataflow_file_path = os.path.join(test_scratch_path, "dataflow_sources.json") + try: + # Run the ScheduleDownload instance + schedule_download_instance.run() + + # Check if the source db was created + with test_engine.connect() as conn: + result = conn.execute(text("SHOW DATABASES")) + db_names = [row[0] for row in result.fetchall()] + assert test_db_name in db_names, f"Expected database {test_db_name} not found" + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file + expected_content = [ + {"parser": "ArrayExpressParser", "name": "ArrayExpress", "priority": 1, "db": "core", "file": "Database"}, + {"parser": "ChecksumParser", "name": "RNACentral", "priority": 1, "db": "checksum", "file": "https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/md5/md5.tsv.gz"} + ] + check_dataflow_content(dataflow_file_path, expected_content) + finally: + # Cleanup: Drop the test database if it exists + with test_engine.connect() as conn: + conn.execute(text(f"DROP DATABASE IF EXISTS {test_db_name}")) + + # Cleanup: Remove the dataflow file if it exists + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) diff --git a/src/python/test/xrefs/test_schedule_parse.py b/src/python/test/xrefs/test_schedule_parse.py new file mode 100644 index 000000000..04b3cd4ca --- /dev/null +++ b/src/python/test/xrefs/test_schedule_parse.py @@ -0,0 +1,224 @@ +import pytest +import os +from unittest.mock import MagicMock +from sqlalchemy import create_engine, text +from sqlalchemy.engine.url import make_url +from typing import Any, Dict, Callable, Optional +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_dataflow_content + +from ensembl.production.xrefs.ScheduleParse import ScheduleParse + +DEFAULT_ARGS = { + "species_name": "test_homo_sapiens_test", + "release": 999, + "registry_url": "http://dummy_registry", + "priority": 1, + "source_db_url": "mysql://user:pass@host/source_db", + "xref_db_url": "mysql://user:pass@host/xref_db", + "get_species_file": False, + "species_db": "mysql://user:pass@host/core_db", +} + +# Fixture to create a ScheduleParse instance 
+@pytest.fixture +def schedule_parse() -> Callable[[Optional[Dict[str, Any]]], ScheduleParse]: + def _create_schedule_parse(args: Optional[Dict[str, Any]] = None) -> ScheduleParse: + # Use provided args or default to default_args + args = args or DEFAULT_ARGS + + return ScheduleParse(args, True, True) + return _create_schedule_parse + +# Function to populate the database with sources +def populate_source_db(mock_source_dbi: DBConnection): + source_data = [ + [1, 'ArrayExpress', 'ArrayExpressParser'], + [2, 'UniParc', 'ChecksumParser'], + [3, 'DBASS3', 'DBASSParser'], + [4, 'MIM', 'MIMParser'], + [5, 'Reactome', 'ReactomeParser'], + [6, 'RefSeq_dna', 'RefSeqParser'], + [7, 'RefSeq_peptide', 'RefSeqParser'], + [8, 'VGNC', 'VGNCParser'], + ] + for row in source_data: + mock_source_dbi.execute( + text("INSERT INTO source (source_id, name, parser) VALUES (:source_id, :name, :parser)"), + {"source_id": row[0], "name": row[1], "parser": row[2],} + ) + + version_data = [ + [1, 'Database', 'core', 1, None, None], + [2, 'dummy_uniparc_file_path', 'checksum', 1, None, None], + [3, 'dummy_dbass_file_path', None, 1, None, None], + [4, 'dummy_mim_file_path', None, 2, None, None], + [5, 'dummy_reactome_file_path', None, 2, 'dummy_reactome_release', None], + [6, 'dummy_refseq_dna_file_path', None, 2, 'dummy_refseq_dna_release', 'dummy_refseq_dna_clean_path'], + [7, 'dummy_refseq_peptide_file_path', None, 3, 'dummy_refseq_peptide_release', 'dummy_refseq_peptide_clean_path'], + [8, 'dummy_vgnc_file_path', None, 1, None, None], + ] + for row in version_data: + mock_source_dbi.execute( + text("INSERT INTO version (source_id, file_path, db, priority, revision, clean_path) VALUES (:source_id, :file_path, :db, :priority, :revision, :clean_path)"), + {"source_id": row[0], "file_path": row[1], "db": row[2], "priority": row[3], "revision": row[4], "clean_path": row[5]} + ) + + mock_source_dbi.commit() + +# Test case to check if an error is raised when a mandatory parameter is missing +def test_schedule_parse_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "species_name") + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "release") + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "registry_url") + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "priority") + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "source_db_url") + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "xref_db_url") + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "get_species_file") + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "sources_config_file") + +# Test case to check if an error is raised when priority is invalid +def test_invalid_priority(schedule_parse: ScheduleParse): + args = DEFAULT_ARGS.copy() + args["priority"] = 4 + schedule_parse_instance = schedule_parse(args) + + with pytest.raises(AttributeError, match="Parameter 'priority' can only be of value 1, 2, or 3"): + schedule_parse_instance.run() + +# Test case to check successful run +def test_successful_run(mock_source_dbi: DBConnection, schedule_parse: ScheduleParse, pytestconfig): + # Setup for test parameters and create a ScheduleParse instance + test_scratch_path = pytestconfig.getoption("test_scratch_path") + test_mysql_url = pytestconfig.getoption("test_db_url") + args = DEFAULT_ARGS.copy() + args["source_db_url"] = mock_source_dbi.engine.url + args["xref_db_url"] = test_mysql_url + 
args["dataflow_output_path"] = test_scratch_path + args["sources_config_file"] = "flatfiles/config.ini" + schedule_parse_instance = schedule_parse(args) + + # Add source data into source db + populate_source_db(mock_source_dbi) + + # Mock needed methods + schedule_parse_instance.get_core_db_info = MagicMock(return_value=(9606, 7742)) + + # Create a db engine for connection + test_engine = create_engine(make_url(test_mysql_url), isolation_level="AUTOCOMMIT") + + try: + # Run the ScheduleParse instance with priority 1 + schedule_parse_instance.run() + + # Check if the xref update db was created + with test_engine.connect() as conn: + result = conn.execute(text("SHOW DATABASES")) + db_names = [row[0] for row in result.fetchall()] + assert "test_homo_sapiens_test_xref_update_999" in db_names, "Expected database test_homo_sapiens_test_xref_update_999 not found" + + # Connect to the db itself and create a table + db_engine = create_engine(make_url(f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999"), isolation_level="AUTOCOMMIT") + with db_engine.connect() as db_conn: + check_row_count(db_conn, "source", 11) + check_row_count(db_conn, "source_url", 14) + check_row_count(db_conn, "species", 3) + + # Get the source ids + source_ids = {} + result = db_conn.execute(text("SELECT source_id,name,priority_description FROM source")).all() + for row in result: + if source_ids.get(row[1]): + source_ids[row[1]].update({row[2]: row[0]}) + else: + source_ids[row[1]] = {row[2]: row[0]} + + # Get + + # Check the dataflow files + expected_content = { + "primary_sources": [ + { + "species_name": "test_homo_sapiens_test", "species_id": 9606, "core_db_url": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999", + "source_id": source_ids["ArrayExpress"]["multi"], "source_name": "ArrayExpress", "parser": "ArrayExpressParser", "db": "core", "file_name": "Database" + }, + { + "species_name": "test_homo_sapiens_test", "species_id": 9606, "core_db_url": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999", + "source_id": source_ids["DBASS3"]["human"], "source_name": "DBASS3", "parser": "DBASSParser", "file_name": "dummy_dbass_file_path" + } + ], + "schedule_secondary": [ + {"species_name": "test_homo_sapiens_test", "species_db": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999"} + ] + } + for dataflow_file in ["primary_sources", "schedule_secondary"]: + # Check if the dataflow file is created + dataflow_file_path = os.path.join(test_scratch_path, f"dataflow_{dataflow_file}.json") + + # Check the content of the dataflow file + check_dataflow_content(dataflow_file_path, expected_content[dataflow_file]) + + # Run the ScheduleParse instance again with priority 2 + schedule_parse_instance.set_param("priority", 2) + schedule_parse_instance.set_param("xref_db_url", f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999") + schedule_parse_instance.run() + + # Check the dataflow files + expected_content = { + "secondary_sources": [ + { + "species_name": "test_homo_sapiens_test", "species_id": 9606, "core_db_url": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999", + "source_id": source_ids["MIM"]["human"], "source_name": "MIM", "parser": "MIMParser", "file_name": "dummy_mim_file_path" + }, + { + "species_name": "test_homo_sapiens_test", "species_id": 9606, "core_db_url": 
"mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999", + "source_id": source_ids["Reactome"]["multi"], "source_name": "Reactome", "parser": "ReactomeParser", "release_file": "dummy_reactome_release", "file_name": "dummy_reactome_file_path" + }, + { + "species_name": "test_homo_sapiens_test", "species_id": 9606, "core_db_url": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999", + "source_id": source_ids["RefSeq_dna"]["human"], "source_name": "RefSeq_dna", "parser": "RefSeqParser", "release_file": "dummy_refseq_dna_release", "file_name": "dummy_refseq_dna_clean_path" + } + ], + "schedule_tertiary": [ + {"species_name": "test_homo_sapiens_test", "species_db": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999"} + ] + } + for dataflow_file in ["secondary_sources", "schedule_tertiary"]: + # Check if the dataflow file is created + dataflow_file_path = os.path.join(test_scratch_path, f"dataflow_{dataflow_file}.json") + + # Check the content of the dataflow file + check_dataflow_content(dataflow_file_path, expected_content[dataflow_file]) + + # Run the ScheduleParse instance again with priority 2 + schedule_parse_instance.set_param("priority", 3) + schedule_parse_instance.run() + + # Check the dataflow files + expected_content = { + "tertiary_sources": [ + { + "species_name": "test_homo_sapiens_test", "species_id": 9606, "core_db_url": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999", + "source_id": source_ids["RefSeq_peptide"]["human"], "source_name": "RefSeq_peptide", "parser": "RefSeqParser", "release_file": "dummy_refseq_peptide_release", "file_name": "dummy_refseq_peptide_clean_path" + } + ], + "dump_ensembl": [ + {"species_name": "test_homo_sapiens_test", "species_db": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999"} + ] + } + for dataflow_file in ["tertiary_sources", "dump_ensembl"]: + # Check if the dataflow file is created + dataflow_file_path = os.path.join(test_scratch_path, f"dataflow_{dataflow_file}.json") + + # Check the content of the dataflow file + check_dataflow_content(dataflow_file_path, expected_content[dataflow_file]) + finally: + # Cleanup: Drop the test database if it exists + with test_engine.connect() as conn: + conn.execute(text("DROP DATABASE IF EXISTS test_homo_sapiens_test_xref_update_999")) + + # Cleanup: Remove the dataflow files if they exist + for dataflow_file in ["primary_sources", "schedule_secondary", "secondary_sources", "schedule_tertiary", "tertiary_sources", "dump_ensembl"]: + dataflow_file_path = os.path.join(test_scratch_path, f"dataflow_{dataflow_file}.json") + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) diff --git a/src/python/test/xrefs/test_schedule_species.py b/src/python/test/xrefs/test_schedule_species.py new file mode 100644 index 000000000..a018f7861 --- /dev/null +++ b/src/python/test/xrefs/test_schedule_species.py @@ -0,0 +1,358 @@ +import pytest +import os +import re +from typing import Any, Dict, Callable, Optional, List +from sqlalchemy import create_engine, text +from sqlalchemy.engine.url import make_url +from test_helpers import check_dataflow_content + +from ensembl.production.xrefs.ScheduleSpecies import ScheduleSpecies + +DEFAULT_ARGS = { + "run_all": False, + "registry_url": "http://dummy_registry", + "release": 999, + 
"metasearch_url": "http://dummy_metasearch", +} + +# Fixture to create a ScheduleSpecies instance +@pytest.fixture +def schedule_species() -> Callable[[Optional[Dict[str, Any]]], ScheduleSpecies]: + def _create_schedule_species(args: Optional[Dict[str, Any]] = None) -> ScheduleSpecies: + # Use provided args or default to default_args + args = args or DEFAULT_ARGS + + return ScheduleSpecies(args, True, True) + return _create_schedule_species + +# Function to create dbs in the registry +def create_dbs_in_registry(registry_url: str, dbs: Dict[str, Dict[str, Any]]) -> List[str]: + dbs_to_cleanup = [] + + test_engine = create_engine(make_url(registry_url), isolation_level="AUTOCOMMIT") + with test_engine.connect() as conn: + # Get all dbs in the registry first + existing_dbs = conn.execute(text(f"SHOW DATABASES")).fetchall() + existing_dbs = [db[0] for db in existing_dbs] + + # Create the dbs that are not already in the registry + for db_name, db_meta in dbs.items(): + if db_name not in existing_dbs: + conn.execute(text(f"CREATE DATABASE {db_name}")) + dbs_to_cleanup.append(db_name) + + release = db_meta.get("release") + division = db_meta.get("division") + + # Connect to the db itself and create a table + db_engine = create_engine(make_url(f"{registry_url}/{db_name}"), isolation_level="AUTOCOMMIT") + with db_engine.connect() as db_conn: + db_conn.execute(text("CREATE TABLE dna (seq_region_id INT(10) PRIMARY KEY, sequence VARCHAR(255) NOT NULL)")) + db_conn.execute(text("CREATE TABLE meta (meta_id INT(10) AUTO_INCREMENT PRIMARY KEY, species_id INT(10) DEFAULT 1, meta_key VARCHAR(40) NOT NULL, meta_value VARCHAR(255) NOT NULL)")) + db_conn.execute(text(f"INSERT INTO meta (meta_key, meta_value) VALUES ('schema_version', '{release}')")) + if division: + db_conn.execute(text(f"INSERT INTO meta (meta_key, meta_value) VALUES ('species.division', '{division}')")) + + return dbs_to_cleanup + +# Function to cleanup dbs in the registry +def cleanup_dbs_in_registry(registry_url: str, dbs: List[str]) -> None: + test_engine = create_engine(make_url(registry_url), isolation_level="AUTOCOMMIT") + with test_engine.connect() as conn: + for db in dbs: + conn.execute(text(f"DROP DATABASE IF EXISTS {db}")) + +def clean_registry_url(registry_url: str) -> str: + match = re.search(r"^(.*)://(.*)", registry_url) + if match: + registry_url = match.group(2) + match = re.search(r"(.*)/(.*)$", registry_url) + if match: + registry_url = match.group(1) + + return registry_url + +# Test case to check if an error is raised when a mandatory parameter is missing +def test_schedule_species_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): + test_missing_required_param("ScheduleSpecies", DEFAULT_ARGS, "run_all") + test_missing_required_param("ScheduleSpecies", DEFAULT_ARGS, "registry_url") + test_missing_required_param("ScheduleSpecies", DEFAULT_ARGS, "release") + +# Test case to check if an error is raised when no species or division are provided +def test_invalid_input(schedule_species: ScheduleSpecies): + # Create a ScheduleSpecies instance + schedule_species_instance = schedule_species() + + with pytest.raises(ValueError, match="Must provide species or division with run_all set to False"): + schedule_species_instance.run() + +# Test case to check if an error is raised when no dbs are found (empty registry) +def test_no_dbs_found(schedule_species: ScheduleSpecies): + # Create a ScheduleSpecies instance + args = DEFAULT_ARGS.copy() + args["run_all"] = True + 
schedule_species_instance = schedule_species(args) + + with pytest.raises(LookupError, match="Could not find any matching dbs in registry dummy_registry"): + schedule_species_instance.run() + +# Test case to check if an error is raised when a species db is present more than once +def test_duplicate_species_dbs(schedule_species: ScheduleSpecies, pytestconfig): + # Create a ScheduleSpecies instance + test_mysql_url = pytestconfig.getoption("test_db_url") + args = DEFAULT_ARGS.copy() + args["registry_url"] = test_mysql_url + args["run_all"] = True + schedule_species_instance = schedule_species(args) + + # Create the dbs in the registry + dbs = { + "bos_taurus_core_999_1" : {"release": 999}, + "bos_taurus_core_999_1_temp": {"release": 999}, + } + created_dbs = create_dbs_in_registry(test_mysql_url, dbs) + + clean_url = clean_registry_url(test_mysql_url) + try: + with pytest.raises(ValueError, match=f"Database {clean_url}/bos_taurus_core_999_1 already loaded for species bos_taurus, cannot load second database {clean_url}/bos_taurus_core_999_1_temp"): + schedule_species_instance.run() + finally: + # Cleanup the dbs in the registry + cleanup_dbs_in_registry(test_mysql_url, created_dbs) + +# Test case to check if an error is raised when a requested species is not found +def test_species_not_found(schedule_species: ScheduleSpecies, pytestconfig): + # Create a ScheduleSpecies instance + test_mysql_url = pytestconfig.getoption("test_db_url") + args = DEFAULT_ARGS.copy() + args["registry_url"] = test_mysql_url + args["species"] = ["species1"] + schedule_species_instance = schedule_species(args) + + with pytest.raises(LookupError, match="Database not found for species1, check registry parameters"): + schedule_species_instance.run() + +# Test case to check successful run with run_all parameter +def test_successful_run_all(schedule_species: ScheduleSpecies, pytestconfig): + # Create a ScheduleSpecies instance + test_mysql_url = pytestconfig.getoption("test_db_url") + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = DEFAULT_ARGS.copy() + args["registry_url"] = test_mysql_url + args["run_all"] = True + args["dataflow_output_path"] = test_scratch_path + schedule_species_instance = schedule_species(args) + + # Create the dbs in the registry + dbs = { + "bos_taurus_core_999_1": {"release": 999}, + "danio_rerio_core_999_1": {"release": 999}, + "equus_caballus_core_999_1": {"release": 999}, + "homo_sapiens_core_998_1": {"release": 998}, + } + created_dbs = create_dbs_in_registry(test_mysql_url, dbs) + + dataflow_file_path = os.path.join(test_scratch_path, "dataflow_species.json") + clean_url = clean_registry_url(test_mysql_url) + try: + # Run the ScheduleSpecies instance + schedule_species_instance.run() + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file + expected_content = [ + {"species_name": "bos_taurus", "species_db": f"{clean_url}/bos_taurus_core_999_1"}, + {"species_name": "danio_rerio", "species_db": f"{clean_url}/danio_rerio_core_999_1"}, + {"species_name": "equus_caballus", "species_db": f"{clean_url}/equus_caballus_core_999_1"} + ] + check_dataflow_content(dataflow_file_path, expected_content) + finally: + # Cleanup the dbs in the registry + cleanup_dbs_in_registry(test_mysql_url, created_dbs) + + # Cleanup: Remove the dataflow file if it exists + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) + +# Test case to check 
successful run with run_all parameter with db_prefix set +def test_successful_run_all_prefix(schedule_species: ScheduleSpecies, pytestconfig): + # Create a ScheduleSpecies instance + test_mysql_url = pytestconfig.getoption("test_db_url") + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = DEFAULT_ARGS.copy() + args["registry_url"] = test_mysql_url + args["run_all"] = True + args["db_prefix"] = "testprefix" + args["dataflow_output_path"] = test_scratch_path + schedule_species_instance = schedule_species(args) + + # Create the dbs in the registry + dbs = { + "bos_taurus_core_999_1": {"release": 999}, + "danio_rerio_core_999_1": {"release": 999}, + "equus_caballus_core_999_1": {"release": 999}, + "homo_sapiens_core_998_1": {"release": 998}, + "testprefix_homo_sapiens_core_999_1": {"release": 999}, + } + created_dbs = create_dbs_in_registry(test_mysql_url, dbs) + + dataflow_file_path = os.path.join(test_scratch_path, "dataflow_species.json") + clean_url = clean_registry_url(test_mysql_url) + try: + # Run the ScheduleSpecies instance + schedule_species_instance.run() + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file + expected_content = [ + {"species_name": "homo_sapiens", "species_db": f"{clean_url}/testprefix_homo_sapiens_core_999_1"}, + ] + check_dataflow_content(dataflow_file_path, expected_content) + finally: + # Cleanup the dbs in the registry + cleanup_dbs_in_registry(test_mysql_url, created_dbs) + + # Cleanup: Remove the dataflow file if it exists + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) + +# Test case to check successful run with specified species +def test_successful_run_species(schedule_species: ScheduleSpecies, pytestconfig): + # Create a ScheduleSpecies instance + test_mysql_url = pytestconfig.getoption("test_db_url") + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = DEFAULT_ARGS.copy() + args["registry_url"] = test_mysql_url + args["dataflow_output_path"] = test_scratch_path + args["species"] = ["bos_taurus", "danio_rerio"] + schedule_species_instance = schedule_species(args) + + # Create the dbs in the registry + dbs = { + "bos_taurus_core_999_1": {"release": 999}, + "danio_rerio_core_999_1": {"release": 999}, + "equus_caballus_core_999_1": {"release": 999}, + "homo_sapiens_core_998_1": {"release": 998}, + } + created_dbs = create_dbs_in_registry(test_mysql_url, dbs) + + dataflow_file_path = os.path.join(test_scratch_path, "dataflow_species.json") + clean_url = clean_registry_url(test_mysql_url) + try: + # Run the ScheduleSpecies instance + schedule_species_instance.run() + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file then remove it + expected_content = [ + {"species_name": "bos_taurus", "species_db": f"{clean_url}/bos_taurus_core_999_1"}, + {"species_name": "danio_rerio", "species_db": f"{clean_url}/danio_rerio_core_999_1"}, + ] + check_dataflow_content(dataflow_file_path, expected_content) + os.remove(dataflow_file_path) + + # Change the antispecies + schedule_species_instance.set_param("antispecies", ["danio_rerio"]) + + # Run the ScheduleSpecies instance again + schedule_species_instance.run() + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # 
Check the content of the dataflow file + expected_content = [ + {"species_name": "bos_taurus", "species_db": f"{clean_url}/bos_taurus_core_999_1"}, + ] + check_dataflow_content(dataflow_file_path, expected_content) + finally: + # Cleanup the dbs in the registry + cleanup_dbs_in_registry(test_mysql_url, created_dbs) + + # Cleanup: Remove the dataflow file if it exists + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) + +# Test case to check successful run with specified division +def test_successful_run_division(schedule_species: ScheduleSpecies, pytestconfig): + # Create a ScheduleSpecies instance + test_mysql_url = pytestconfig.getoption("test_db_url") + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = DEFAULT_ARGS.copy() + args["registry_url"] = test_mysql_url + args["dataflow_output_path"] = test_scratch_path + args["division"] = "EnsemblVertebrates" + schedule_species_instance = schedule_species(args) + + # Create the dbs in the registry + dbs = { + "bos_taurus_core_999_1": {"release": 999, "division": "EnsemblVertebrates"}, + "danio_rerio_core_999_1": {"release": 999, "division": "EnsemblVertebrates"}, + "equus_caballus_core_999_1": {"release": 999, "division": "EnsemblVertebrates"}, + "equus_caballus_core_998_1": {"release": 998, "division": "EnsemblVertebrates"}, + "zea_mays_core_999_1": {"release": 999, "division": "EnsemblPlants"}, + } + created_dbs = create_dbs_in_registry(test_mysql_url, dbs) + + dataflow_file_path = os.path.join(test_scratch_path, "dataflow_species.json") + clean_url = clean_registry_url(test_mysql_url) + try: + # Run the ScheduleSpecies instance + schedule_species_instance.run() + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file then remove it + expected_content = [ + {"species_name": "bos_taurus", "species_db": f"{clean_url}/bos_taurus_core_999_1"}, + {"species_name": "danio_rerio", "species_db": f"{clean_url}/danio_rerio_core_999_1"}, + {"species_name": "equus_caballus", "species_db": f"{clean_url}/equus_caballus_core_999_1"}, + ] + check_dataflow_content(dataflow_file_path, expected_content) + os.remove(dataflow_file_path) + + # Change the antispecies + schedule_species_instance.set_param("antispecies", ["danio_rerio"]) + + # Run the ScheduleSpecies instance again + schedule_species_instance.run() + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file + expected_content = [ + {"species_name": "bos_taurus", "species_db": f"{clean_url}/bos_taurus_core_999_1"}, + {"species_name": "equus_caballus", "species_db": f"{clean_url}/equus_caballus_core_999_1"}, + ] + check_dataflow_content(dataflow_file_path, expected_content) + os.remove(dataflow_file_path) + + # Change the division + schedule_species_instance.set_param("division", "EnsemblPlants") + + # Run the ScheduleSpecies instance again + schedule_species_instance.run() + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file + expected_content = [ + {"species_name": "zea_mays", "species_db": f"{clean_url}/zea_mays_core_999_1"}, + ] + check_dataflow_content(dataflow_file_path, expected_content) + finally: + # Cleanup the dbs in the registry + cleanup_dbs_in_registry(test_mysql_url, 
created_dbs) + + # Cleanup: Remove the dataflow file if it exists + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) \ No newline at end of file From 5c5f862f34c0d3d202830faedf1e7c89affedec1 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 18 Nov 2024 12:57:00 +0000 Subject: [PATCH 04/12] Changes/fixes to the download pipeline --- scripts/xrefs/cleanup_and_split_source.pl | 104 +-- scripts/xrefs/cleanup_source.pl | 268 +++--- src/python/ensembl/common/Params.py | 348 +++++--- src/python/ensembl/production/xrefs/Base.py | 810 ++++++++---------- .../ensembl/production/xrefs/Checksum.py | 26 +- .../production/xrefs/DownloadSource.py | 59 +- .../production/xrefs/EmailNotification.py | 399 ++++----- .../production/xrefs/ScheduleCleanup.py | 33 +- .../production/xrefs/ScheduleDownload.py | 17 +- .../xrefs/config/gencode_sources.json | 204 +++++ .../xrefs/config/xref_all_sources.json | 71 +- .../production/xrefs/config/xref_config.ini | 28 +- 12 files changed, 1231 insertions(+), 1136 deletions(-) create mode 100644 src/python/ensembl/production/xrefs/config/gencode_sources.json diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl index cb92281a3..f1ea08be0 100644 --- a/scripts/xrefs/cleanup_and_split_source.pl +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -42,11 +42,11 @@ ); # Check that all mandatory parameters are passed -if (!defined($base_path) || !defined($source_db_url) || !defined($source_name) || !defined($clean_dir) || !defined($clean_files)) { - croak "Usage: cleanup_source.pl --base_path --source_db_url --name --clean_dir --clean_files [--version_file ] [--tax_ids_file ] [--update_mode ] [--log_timestamp ]"; +foreach my $param ($base_path, $source_db_url, $source_name, $clean_dir, $clean_files) { + defined $param or croak "Usage: cleanup_source.pl --base_path --source_db_url --name --clean_dir --clean_files [--version_file ] [--tax_ids_file ] [--update_mode ] [--log_timestamp ]"; } -if (!defined($update_mode)) {$update_mode = 0;} +$update_mode //= 0; my $log_file; if (defined($log_timestamp)) { @@ -65,18 +65,15 @@ } # Remove last '/' character if it exists -if ($base_path =~ /\/$/) {chop($base_path);} +chop($base_path) if $base_path =~ /\/$/; # Remove / char from source name to access directory -my $clean_name = $source_name; -$clean_name =~ s/\///g; +(my $clean_name = $source_name) =~ s/\///g; -my $output_path = $clean_dir."/".$clean_name; +my $output_path = catdir($clean_dir, $clean_name); # Create needed directories -if (!$update_mode) { - rmtree($output_path); -} +rmtree($output_path) unless $update_mode; make_path($output_path); my $sources_to_remove; @@ -89,11 +86,7 @@ $output_file_name = ($source_name =~ /SPTREMBL/ ? 
'uniprot_trembl' : 'uniprot_sprot'); # Set sources to skip in parsing step - my @source_names = ( - 'GO', 'UniGene', 'RGD', 'CCDS', 'IPI', 'UCSC', 'SGD', 'HGNC', 'MGI', 'VGNC', 'Orphanet', - 'ArrayExpress', 'GenomeRNAi', 'EPD', 'Xenbase', 'Reactome', 'MIM_GENE', 'MIM_MORBID', 'MIM', - 'Interpro' - ); + my @source_names = qw(GO UniGene RGD CCDS IPI UCSC SGD HGNC MGI VGNC Orphanet ArrayExpress GenomeRNAi EPD Xenbase Reactome MIM_GENE MIM_MORBID MIM Interpro); $sources_to_remove = join("|", @source_names); } elsif ($source_name =~ /^RefSeq_dna/) { $is_refseq_dna = 1; @@ -109,47 +102,45 @@ my %tax_ids; my ($skipped_species, $added_species) = (0, 0); if ($tax_ids_file && $update_mode) { - open my $fh, '<', $tax_ids_file; + open my $fh, '<', $tax_ids_file or die "Couldn't open tax_ids_file '$tax_ids_file' $!"; chomp(my @lines = <$fh>); close $fh; %tax_ids = map { $_ => 1 } @lines; # Check if any taxonomy IDs already have files - foreach my $tax_id (keys(%tax_ids)) { - my @tax_files = glob($output_path . "/**/**/**/**/" . $output_file_name . "-" . $tax_id); - if (scalar(@tax_files) > 0) { + foreach my $tax_id (keys %tax_ids) { + my @tax_files = glob(catfile($output_path, "**", "**", "**", "**", "$output_file_name-$tax_id")); + if (@tax_files) { $tax_ids{$tax_id} = 0; $skipped_species++; } } # Do nothing if all taxonomy IDs already have files - if ($skipped_species == scalar(keys(%tax_ids))) { + if ($skipped_species == keys %tax_ids) { add_to_log_file($log_file, "All provided tax IDs already have files. Doing nothing."); exit; } } # Get all files for source -my $files_path = $base_path."/".$clean_name; -my @files = glob($files_path."/*"); +my $files_path = catdir($base_path, $clean_name); +my @files = glob(catfile($files_path, "*")); my $out_fh; my $current_species_id; # Process each file -foreach my $input_file_name (@files) { - local $/ = "//\n"; +foreach my $input_file (@files) { + # Skip the release file + next if defined($version_file) && $input_file eq $version_file; - add_to_log_file($log_file, "Splitting up file $input_file_name"); + local $/ = "//\n"; - $input_file_name = basename($input_file_name); - my $input_file = $files_path."/".$input_file_name; + add_to_log_file($log_file, "Splitting up file $input_file"); + my $input_file_name = basename($input_file); my $in_fh; - # Skip the release file - if (defined($version_file) && $input_file eq $version_file) {next;} - - # Open file normally or with zcat for zipped filed + # Open file normally or with zcat for zipped files if ($input_file_name =~ /\.(gz|Z)$/x) { open($in_fh, "zcat $input_file |") or die "Couldn't call 'zcat' to open input file '$input_file' $!"; $output_file_name =~ s/\.[^.]+$//; @@ -167,14 +158,14 @@ my $species_id; if ($is_uniprot) { ($species_id) = $record =~ /OX\s+[a-zA-Z_]+=([0-9 ,]+).*;/; - $species_id =~ s/\s// if $species_id; + $species_id =~ s/\s//g if $species_id; } else { ($species_id) = $record =~ /db_xref=.taxon:(\d+)/; } # Only continue with wanted species - next if (!$species_id); - next if ($tax_ids_file && (!defined($tax_ids{$species_id}) || !$tax_ids{$species_id})); + next unless $species_id; + next if $tax_ids_file && (!defined($tax_ids{$species_id}) || !$tax_ids{$species_id}); # Clean up data if ($clean_files) { @@ -205,31 +196,27 @@ } } - if (!$skip_data) { - push(@new_record, $line); - } - - $record = join("\n", @new_record); + push(@new_record, $line) unless $skip_data; } + + $record = join("\n", @new_record); } } # Write the record in the appropriate file if (!defined($current_species_id) || 
(defined($current_species_id) && $species_id ne $current_species_id)) { - close($out_fh) if (defined($current_species_id)); + close($out_fh) if defined($current_species_id); my $species_id_str = sprintf("%04d", $species_id); my @digits = split('', $species_id_str); - $write_path = catdir($output_path, $digits[0], $digits[1], $digits[2], $digits[3]); + $write_path = catdir($output_path, @digits); make_path($write_path); - $write_file = $write_path."/".$output_file_name."-".$species_id; + $write_file = catfile($write_path, "$output_file_name-$species_id"); # Check if creating new file - if (!-e $write_file) { - $added_species++; - } + $added_species++ unless -e $write_file; open($out_fh, '>>', $write_file) or die "Couldn't open output file '$write_file' $!"; @@ -240,42 +227,33 @@ } close($in_fh); - close($out_fh) if $out_fh; } } +close($out_fh) if $out_fh; + add_to_log_file($log_file, "Source $source_name cleaned up"); add_to_log_file($log_file, "$source_name skipped species = $skipped_species"); add_to_log_file($log_file, "$source_name species files created = $added_species"); # Save the clean files directory in source db -my ($user, $pass, $host, $port, $source_db) = parse_url($source_db_url); +my ($host, $port, $user, $pass, $source_db) = parse_url($source_db_url); my $dbi = get_dbi($host, $port, $user, $pass, $source_db); -my $update_version_sth = $dbi->prepare("UPDATE IGNORE version set clean_uri=? where source_id=(SELECT source_id FROM source WHERE name=?)"); +my $update_version_sth = $dbi->prepare("UPDATE IGNORE version SET clean_path=? WHERE source_id=(SELECT source_id FROM source WHERE name=?)"); $update_version_sth->execute($output_path, $source_name); $update_version_sth->finish(); sub get_dbi { my ($host, $port, $user, $pass, $dbname) = @_; - my $dbconn; - if (defined $dbname) { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname); - } else { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s", $host, $port); - } - my $dbi = DBI->connect( $dbconn, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr ); + my $dbconn = defined $dbname ? sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname) : sprintf("dbi:mysql:host=%s;port=%s", $host, $port); + my $dbi = DBI->connect($dbconn, $user, $pass, { 'RaiseError' => 1 }) or croak("Can't connect to database: " . 
$DBI::errstr); return $dbi; } sub parse_url { my ($url) = @_; my $parsed_url = Nextflow::Utils::parse($url); - my $user = $parsed_url->{'user'}; - my $pass = $parsed_url->{'pass'}; - my $host = $parsed_url->{'host'}; - my $port = $parsed_url->{'port'}; - my $db = $parsed_url->{'dbname'}; - return ($user, $pass, $host, $port, $db); + return @{$parsed_url}{qw(host port user pass dbname)}; } sub add_to_log_file { @@ -284,8 +262,8 @@ sub add_to_log_file { if (defined($log_file)) { my $current_timestamp = strftime "%d-%b-%Y %H:%M:%S", localtime; - open(my $fh, '>>', $log_file); + open(my $fh, '>>', $log_file) or die "Couldn't open log file '$log_file' $!"; print $fh "$current_timestamp | INFO | $message\n"; close($fh); } -} \ No newline at end of file +} diff --git a/scripts/xrefs/cleanup_source.pl b/scripts/xrefs/cleanup_source.pl index 1226e6e1c..07b330717 100644 --- a/scripts/xrefs/cleanup_source.pl +++ b/scripts/xrefs/cleanup_source.pl @@ -37,8 +37,8 @@ ); # Check that all mandatory parameters are passed -if (!defined($base_path) || !defined($source_db_url) || !defined($source_name) || !defined($clean_dir) || !defined($skip_download) || !defined($clean_files)) { - croak "Usage: cleanup_source.pl --base_path --source_db_url --name --clean_dir --skip_download --clean_files [--version_file ] [--log_timestamp ]"; +foreach my $param ($base_path, $source_db_url, $source_name, $clean_dir, $skip_download, $clean_files) { + defined $param or croak "Usage: cleanup_source.pl --base_path --source_db_url --name --clean_dir --skip_download --clean_files [--version_file ] [--log_timestamp ]"; } my $log_file; @@ -50,176 +50,160 @@ add_to_log_file($log_file, "CleanupSource starting for source $source_name"); } -# Do nothing if not cleaning files, not a uniprot or refseq source, or no new download -if ($clean_files && ($source_name =~ /^Uniprot/ || $source_name =~ /^RefSeq_/)) { - # Remove last '/' character if it exists - if ($base_path =~ /\/$/) {chop($base_path);} +# Do nothing if not cleaning files or if not a uniprot or refseq source +if (!$clean_files || ($source_name !~ /^Uniprot/ && $source_name !~ /^RefSeq_/)) { + add_to_log_file($log_file, "Provided source name is invalid. 
Can only clean up and split Uniprot or RefSeq files."); + exit; +} - # Remove / char from source name to access directory - my $clean_name = $source_name; - $clean_name =~ s/\///g; +# Remove last '/' character if it exists +chop($base_path) if $base_path =~ /\/$/; - my $output_path = $clean_dir."/".$clean_name; - my $update_clean_uri = 0; +# Remove / char from source name to access directory +(my $clean_name = $source_name) =~ s/\///g; - # If not a new download, check if clean files exist - if ($skip_download) { - if (-d $output_path) { - $update_clean_uri = 1 - } - } else { - # Create needed directories - make_path($output_path); +my $output_path = catdir($clean_dir, $clean_name); +my $update_clean_uri = 0; +# If not a new download, check if clean files exist +if ($skip_download) { + if (-d $output_path) { $update_clean_uri = 1; + } +} else { + # Create needed directories + make_path($output_path); + $update_clean_uri = 1; + + my $sources_to_remove; + my ($is_uniprot, $is_refseq_dna, $is_refseq_peptide) = (0, 0, 0); + my $file_size = 0; + + # Set sources to skip in parsing step (uniprot only) + if ($source_name =~ /^Uniprot/) { + $is_uniprot = 1; + my @source_names = qw(GO UniGene RGD CCDS IPI UCSC SGD HGNC MGI VGNC Orphanet ArrayExpress GenomeRNAi EPD Xenbase Reactome MIM_GENE MIM_MORBID MIM Interpro); + $sources_to_remove = join("|", @source_names); + $file_size = 200000; + } elsif ($source_name =~ /^RefSeq_dna/) { + $is_refseq_dna = 1; + } elsif ($source_name =~ /^RefSeq_peptide/) { + $is_refseq_peptide = 1; + } else { + croak "Unknown file type $source_name"; + } - my $sources_to_remove; - my ($is_uniprot, $is_refseq_dna, $is_refseq_peptide) = (0, 0, 0); - my $file_size = 0; - - # Set sources to skip in parsing step (uniprot only) - if ($source_name =~ /^Uniprot/) { - $is_uniprot = 1; - my @source_names = ( - 'GO', 'UniGene', 'RGD', 'CCDS', 'IPI', 'UCSC', 'SGD', 'HGNC', 'MGI', 'VGNC', 'Orphanet', - 'ArrayExpress', 'GenomeRNAi', 'EPD', 'Xenbase', 'Reactome', 'MIM_GENE', 'MIM_MORBID', 'MIM', - 'Interpro' - ); - $sources_to_remove = join("|", @source_names); - $file_size = 200000; - } elsif ($source_name =~ /^RefSeq_dna/) { - $is_refseq_dna = 1; - } elsif ($source_name =~ /^RefSeq_peptide/) { - $is_refseq_peptide = 1; - } else { - croak "Unknown file type $source_name"; - } + # Get all files for source + my $files_path = catdir($base_path, $clean_name); + my @files = glob(catfile($files_path, "*")); - # Get all files for source - my $files_path = $base_path."/".$clean_name; - my @files = `ls $files_path`; - foreach my $file_name (@files) { - $file_name =~ s/\n//; - my $file = $files_path."/".$file_name; + # Process each file + foreach my $input_file (@files) { + # Skip the release file + next if defined($version_file) && $input_file eq $version_file; - # Skip the release file - if (defined($version_file) && $file eq $version_file) {next;} + add_to_log_file($log_file, "Cleaning up file $input_file"); + my ($in_fh, $out_fh); + my $input_file_name = basename($input_file); + my $output_file = $input_file_name; - my ($in_fh, $out_fh); - my $output_file = $file_name; + # Open file normally or with zcat for zipped files + if ($input_file_name =~ /\.(gz|Z)$/x) { + open($in_fh, "zcat $input_file |") or die "Couldn't call 'zcat' to open input file '$input_file' $!"; + $output_file =~ s/\.[^.]+$//; + } else { + open($in_fh, '<', $input_file) or die "Couldn't open file input '$input_file' $!"; + } - # Open file normally or with zcat for zipped filed - if ($file_name =~ /\.(gz|Z)$/x) { - open($in_fh, 
"zcat $file |") - or die "Couldn't call 'zcat' to open input file '$file' $!"; + # Only start cleaning up if could get filehandle + my $count = 0; + my $file_count = 1; + if (defined($in_fh)) { + if ($is_uniprot) { + local $/ = "//\n"; + + my $write_file = catfile($output_path, "$output_file-$file_count"); + open($out_fh, '>', $write_file) or die "Couldn't open output file '$write_file' $!"; + + # Read full records + while (my $record = $in_fh->getline()) { + # Remove unused data + $record =~ s/\nR(N|P|X|A|T|R|L|C|G)\s{3}.*//g; # Remove references lines + $record =~ s/\nCC(\s{3}.*)CAUTION: The sequence shown here is derived from an Ensembl(.*)/\nCT$1CAUTION: The sequence shown here is derived from an Ensembl$2/g; # Set specific caution comment to temporary + $record =~ s/\nCC\s{3}.*//g; # Remove comments + $record =~ s/\nCT(\s{3}.*)CAUTION: The sequence shown here is derived from an Ensembl(.*)/\nCC$1CAUTION: The sequence shown here is derived from an Ensembl$2/g; # Set temp line back to comment + $record =~ s/\nFT\s{3}.*//g; # Remove feature coordinates + $record =~ s/\nDR\s{3}($sources_to_remove);.*//g; # Remove sources skipped at processing + + # Added lines that we do need into output + print $out_fh $record; + + # Check how many lines have been processed and write to new file if size exceeded + $count++; + if ($count > $file_size) { + close($out_fh); + $file_count++; + $write_file = catfile($output_path, "$output_file-$file_count"); + open($out_fh, '>', $write_file) or die "Couldn't open output file '$write_file' $!"; + $count = 0; + } + } - $output_file =~ s/\.[^.]+$//; + close($in_fh); + close($out_fh); } else { - open($in_fh, '<', $file) - or die "Couldn't open file input '$file' $!"; - } - - # Only start cleaning up if could get filehandle - my $count = 0; - my $file_count = 1; - if (defined($in_fh)) { - if ($is_uniprot) { - local $/ = "//\n"; - - my $write_file = $output_path."/".$output_file . "-$file_count"; - open($out_fh, '>', $write_file) or die "Couldn't open output file '$write_file' $!"; - - # Read full records - while ($_ = $in_fh->getline()) { - # Remove unused data - $_ =~ s/\nR(N|P|X|A|T|R|L|C|G)\s{3}.*//g; # Remove references lines - $_ =~ s/\nCC(\s{3}.*)CAUTION: The sequence shown here is derived from an Ensembl(.*)/\nCT$1CAUTION: The sequence shown here is derived from an Ensembl$2/g; # Set specific caution comment to temporary - $_ =~ s/\nCC\s{3}.*//g; # Remove comments - $_ =~ s/\nCT(\s{3}.*)CAUTION: The sequence shown here is derived from an Ensembl(.*)/\nCC$1CAUTION: The sequence shown here is derived from an Ensembl$2/g; # Set temp line back to comment - $_ =~ s/\nFT\s{3}.*//g; # Remove feature coordinates - $_ =~ s/\nDR\s{3}($sources_to_remove);.*//g; # Remove sources skipped at processing - - # Added lines that we do need into output - print $out_fh $_; - - # Check how many lines have been processed and write to new file if size exceeded - $count++; - if ($count > $file_size) { - close($out_fh); - $file_count++; - $write_file = $output_path."/".$output_file . 
"-$file_count"; - open($out_fh, '>', $write_file) - or die "Couldn't open output file '$write_file' $!"; - $count = 0; + $output_file = catfile($output_path, $output_file); + open($out_fh, '>', $output_file) or die "Couldn't open output file '$output_file' $!"; + + # Remove unused data + my $skip_data = 0; + while (my $line = <$in_fh>) { + if ($is_refseq_dna) { + if ($line =~ /^REFERENCE/ || $line =~ /^COMMENT/ || $line =~ /^\s{5}exon/ || $line =~ /^\s{5}misc_feature/ || $line =~ /^\s{5}variation/) { + $skip_data = 1; + } elsif ($line =~ /^\s{5}source/ || $line =~ /^ORIGIN/) { + $skip_data = 0; } - } - - close($in_fh); - close($out_fh); - } else { - $output_file = $output_path."/".$output_file; - open($out_fh, '>', $output_file) or die "Couldn't open output file '$output_file' $!"; - - # Remove unuused data - my $skip_data = 0; - while (<$in_fh>) { - if ($is_refseq_dna) { - if ($_ =~ /^REFERENCE/ || $_ =~ /^COMMENT/ || $_ =~ /^\s{5}exon/ || $_ =~ /^\s{5}misc_feature/ || $_ =~ /^\s{5}variation/) { - $skip_data = 1; - } elsif ($_ =~ /^\s{5}source/ || $_ =~ /^ORIGIN/) { - $skip_data = 0; - } - } elsif ($is_refseq_peptide) { - if ($_ =~ /^REFERENCE/ || $_ =~ /^COMMENT/ || $_ =~ /^\s{5}Protein/) { - $skip_data = 1; - } elsif ($_ =~ /^\s{5}source/ || $_ =~ /^\s{5}CDS/ || $_ =~ /^ORIGIN/) { - $skip_data = 0; - } + } elsif ($is_refseq_peptide) { + if ($line =~ /^REFERENCE/ || $line =~ /^COMMENT/ || $line =~ /^\s{5}Protein/) { + $skip_data = 1; + } elsif ($line =~ /^\s{5}source/ || $line =~ /^\s{5}CDS/ || $line =~ /^ORIGIN/) { + $skip_data = 0; } - - if (!$skip_data) {print $out_fh $_;} } - close($in_fh); - close($out_fh); + print $out_fh $line unless $skip_data; } + + close($in_fh); + close($out_fh); } } - - add_to_log_file($log_file, "Source $source_name cleaned up"); } - # Save the clean files directory in source db - if ($update_clean_uri) { - my ($user, $pass, $host, $port, $source_db) = parse_url($source_db_url); - my $dbi = get_dbi($host, $port, $user, $pass, $source_db); - my $update_version_sth = $dbi->prepare("UPDATE IGNORE version set clean_uri=? where source_id=(SELECT source_id FROM source WHERE name=?)"); - $update_version_sth->execute($output_path, $source_name); - $update_version_sth->finish(); - } + add_to_log_file($log_file, "Source $source_name cleaned up"); +} + +# Save the clean files directory in source db +if ($update_clean_uri) { + my ($host, $port, $user, $pass, $source_db) = parse_url($source_db_url); + my $dbi = get_dbi($host, $port, $user, $pass, $source_db); + my $update_version_sth = $dbi->prepare("UPDATE IGNORE version SET clean_path=? WHERE source_id=(SELECT source_id FROM source WHERE name=?)"); + $update_version_sth->execute($output_path, $source_name); + $update_version_sth->finish(); } sub get_dbi { my ($host, $port, $user, $pass, $dbname) = @_; - my $dbconn; - if (defined $dbname) { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname); - } else { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s", $host, $port); - } - my $dbi = DBI->connect( $dbconn, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr ); + my $dbconn = defined $dbname ? sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname) : sprintf("dbi:mysql:host=%s;port=%s", $host, $port); + my $dbi = DBI->connect($dbconn, $user, $pass, { 'RaiseError' => 1 }) or croak("Can't connect to database: " . 
$DBI::errstr); return $dbi; } sub parse_url { my ($url) = @_; my $parsed_url = Nextflow::Utils::parse($url); - my $user = $parsed_url->{'user'}; - my $pass = $parsed_url->{'pass'}; - my $host = $parsed_url->{'host'}; - my $port = $parsed_url->{'port'}; - my $db = $parsed_url->{'dbname'}; - return ($user, $pass, $host, $port, $db); + return @{$parsed_url}{qw(host port user pass dbname)}; } sub add_to_log_file { @@ -228,7 +212,7 @@ sub add_to_log_file { if (defined($log_file)) { my $current_timestamp = strftime "%d-%b-%Y %H:%M:%S", localtime; - open(my $fh, '>>', $log_file); + open(my $fh, '>>', $log_file) or die "Couldn't open log file '$log_file' $!"; print $fh "$current_timestamp | INFO | $message\n"; close($fh); } diff --git a/src/python/ensembl/common/Params.py b/src/python/ensembl/common/Params.py index b7a163a14..9d1a7b05f 100644 --- a/src/python/ensembl/common/Params.py +++ b/src/python/ensembl/common/Params.py @@ -18,14 +18,14 @@ import re import json import argparse +import os -from typing import Dict, Any +from typing import Dict, Any, Optional, Type sys.tracebacklimit = 0 - class Params: - def __init__(self, params: Dict[str, Any] = None, parse_dataflow_json: bool = True) -> None: + def __init__(self, params: Optional[Dict[str, Any]] = None, parse_dataflow_json: bool = True) -> None: """Params constructor. Parameters @@ -35,13 +35,8 @@ def __init__(self, params: Dict[str, Any] = None, parse_dataflow_json: bool = Tr parse_dataflow_json: bool, optional Specifies whether to parse an option called 'dataflow' in the provided options (default is True) """ - if params is None: - params = {} - - if params: - self._params = params - else: - self._params = {} + self._params = params if params is not None else {} + if not params: self.parse_argv_params(parse_dataflow_json) def parse_argv_params(self, parse_dataflow_json: bool = True) -> None: @@ -56,8 +51,7 @@ def parse_argv_params(self, parse_dataflow_json: bool = True) -> None: args = sys.argv[1:] # Extract param names from command line - r = re.compile(r"^--") - param_names = list(filter(r.match, args)) + param_names = [arg for arg in args if arg.startswith("--")] parser = argparse.ArgumentParser() for name in param_names: @@ -68,160 +62,240 @@ def parse_argv_params(self, parse_dataflow_json: bool = True) -> None: if param_name == "dataflow" and parse_dataflow_json: dataflow_params = json.loads(getattr(params, param_name)) for name, value in dataflow_params.items(): - self.param(name, value) + self.set_param(name, value) else: - self.param(param_name, getattr(params, param_name)) - - def param(self, name: str, new_value: Any = None, options: Dict[str, Any] = None) -> Any: - """Gets or sets a parameter value. + self.set_param(param_name, getattr(params, param_name)) - Parameters - ---------- - name: str - The name of the paramater - new_value: any, optional - The value to set the parameter to (default is None) - options: dict, optional - Extra options, including: - - default: The default value to use if parameter has no value (sets the parameter value to this) - - type: The type of the parameter value, used to check if value is valid - - Returns - ------- - The value of the parameter with provided name. - - Raises - ------ - AttributeError - If no parameter name was passed. 
- """ + def get_param(self, name: str, options: Optional[Dict[str, Any]] = None) -> Any: if not name: raise AttributeError("You must supply a parameter name") if options is None: options = {} - value = None - - if new_value is not None: - self._params[name] = new_value - value = new_value - else: - value = self._params.get(name) - if value is None and options.get("default") is not None: - default = options["default"] - self._params[name] = default - value = default - - if options.get("type"): - return self.check_type(name, value, options["type"]) - - return value - - def param_required(self, name: str, options: Dict[str, Any] = None) -> Any: - """Gets a parameter value, raising an error if no value is found. + value = self._params.get(name) + if value is None: + if "default" in options: + value = options["default"] + elif "required" in options and options["required"]: + raise AttributeError(f"Parameter '{name}' is required but has no value") - Parameters - ---------- - name: str - The name of th parameter - options: dict, optional - Extra options, including: - - default: The default value to use if parameter has no value (sets the parameter value to this) - - type: The type of the parameter value, used to check if value is valid + return self.set_param(name, value, options) - Returns - ------- - The value of the parameter with provided name. + def set_param(self, name: str, value: Any, options: Optional[Dict[str, Any]] = None) -> Any: + if not name: + raise AttributeError("You must supply a parameter name") + if options is None: + options = {} - Raises - ------ - AttributeError - If no value is found for the required paramater. - """ - value = self.param(name, None, options) + if "type" in options: + value = self.check_type(name, value, options["type"]) - if value is None: - raise AttributeError(f"Parameter '{name}' is required but has no value") + self._params[name] = value return value - def check_type(self, name: str, value: Any, value_type: str) -> Any: - """Checks if the parameter value provided is valid. - For specific types, this function can change the parameter value. + # def param(self, name: str, new_value: Any = None, options: Optional[Dict[str, Any]] = None) -> Any: + # """Gets or sets a parameter value. + + # Parameters + # ---------- + # name: str + # The name of the parameter + # new_value: any, optional + # The value to set the parameter to (default is None) + # options: dict, optional + # Extra options, including: + # - default: The default value to use if parameter has no value (sets the parameter value to this) + # - type: The type of the parameter value, used to check if value is valid + + # Returns + # ------- + # The value of the parameter with provided name. + + # Raises + # ------ + # AttributeError + # If no parameter name was passed. + # """ + # if not name: + # raise AttributeError("You must supply a parameter name") + # if options is None: + # options = {} + + # if new_value is not None: + # self._params[name] = new_value + # value = new_value + # else: + # value = self._params.get(name) + # if value is None and "default" in options: + # value = options["default"] + # self._params[name] = value + + # if "type" in options: + # return self.check_type(name, value, options["type"]) + + # return value + + # def param_required(self, name: str, options: Optional[Dict[str, Any]] = None) -> Any: + # """Gets a parameter value, raising an error if no value is found. 
+ + # Parameters + # ---------- + # name: str + # The name of the parameter + # options: dict, optional + # Extra options, including: + # - default: The default value to use if parameter has no value (sets the parameter value to this) + # - type: The type of the parameter value, used to check if value is valid + + # Returns + # ------- + # The value of the parameter with provided name. + + # Raises + # ------ + # AttributeError + # If no value is found for the required parameter. + # """ + # value = self.param(name, None, options) + + # if value is None: + # raise AttributeError(f"Parameter '{name}' is required but has no value") + + # return value + + def check_type(self, name: str, value: Any, value_type: Type) -> Any: + """Checks if the parameter value is of the expected type and attempts conversion if necessary. Parameters ---------- name: str The name of the parameter - value: any + value: Any The value of the parameter - value_type: str - The type of the parameter value. Accepted types: - - hash, dict, or dictionary - - array or list - - int or integer - - bool or boolean - - str or string + value_type: Type + The expected type of the parameter (e.g., `int`, `str`, `bool`) Returns ------- - None if no value is found, or the new value of the parameter with provided name. + The value of the parameter with provided name, converted to the correct type if necessary. Raises ------ AttributeError - If no parameter name is provided. - If parameter value is not valid. + If the parameter name is missing or the value cannot be converted to the specified type. """ if not name: raise AttributeError("You must supply a parameter name") if value is None: return - value_type = value_type.lower() - error, update = False, True - new_value = None - - if value_type in ["hash", "dict", "dictionary"] and not isinstance(value, dict): - error = True - elif value_type in ["array", "list"] and not isinstance(value, list): - # Try to split by commas - if re.search(",", value): - new_value = value.split(",") - else: - new_value = [value] - elif value_type in ["int", "integer"] and not isinstance(value, int): - # Try to make it an integer - try: - new_value = int(value) - except ValueError: - error = True - elif value_type in ["bool", "boolean"] and not isinstance(value, bool): - # Try to make it a boolean + # Special cases first + if value_type is list: + if isinstance(value, str): + # Split the string by commas if present, otherwise wrap it in a list + value = re.sub(r"\s*,\s*", ",", value) + value = value.split(",") if "," in value else [value] + elif not isinstance(value, list): + # If value is not a list and not a string, raise an error + raise AttributeError(f"Parameter '{name}' has an invalid value '{value}'. Expected type list") + elif value_type is bool: if isinstance(value, int): - new_value = bool(value) - elif isinstance(value, str) and value in ["True", "False"]: - new_value = bool(value) - elif value in ["0", "1", 0, 1]: - new_value = bool(int(value)) - else: - error = True - elif value_type in ["str", "string"] and not isinstance(value, str): - new_value = str(value) - else: - update = False - - if error: - raise AttributeError( - f"Parameter '{name}' has an invalid value '{value}'. 
Must be of type {value_type}" - ) - - if update: - self.param(name, new_value) - value = new_value + value = bool(value) + elif isinstance(value, str): + if value in ["True", "False"]: + value = value == "True" + elif value in ["0", "1"]: + value = bool(int(value)) + elif not isinstance(value, bool): + raise AttributeError(f"Parameter '{name}' has an invalid value '{value}'. Expected type bool") + + # General type checking for other types + # if not isinstance(value, value_type): + try: + value = value_type(value) # Attempt conversion + except (ValueError, TypeError): + raise AttributeError(f"Parameter '{name}' has an invalid value '{value}'. Expected type {value_type.__name__}") return value + # def check_type(self, name: str, value: Any, value_type: str) -> Any: + # """Checks if the parameter value provided is valid. + # For specific types, this function can change the parameter value. + + # Parameters + # ---------- + # name: str + # The name of the parameter + # value: any + # The value of the parameter + # value_type: str + # The type of the parameter value. Accepted types: + # - hash, dict, or dictionary + # - array or list + # - int or integer + # - bool or boolean + # - str or string + + # Returns + # ------- + # None if no value is found, or the new value of the parameter with provided name. + + # Raises + # ------ + # AttributeError + # If no parameter name is provided. + # If parameter value is not valid. + # """ + # if not name: + # raise AttributeError("You must supply a parameter name") + # if value is None: + # return + + # value_type = value_type.lower() + # error, update = False, True + # new_value = None + + # if value_type in ["hash", "dict", "dictionary"] and not isinstance(value, dict): + # error = True + # elif value_type in ["array", "list"] and not isinstance(value, list): + # # Try to split by commas + # if isinstance(value, str) and "," in value: + # new_value = value.split(",") + # else: + # new_value = [value] + # elif value_type in ["int", "integer"] and not isinstance(value, int): + # # Try to make it an integer + # try: + # new_value = int(value) + # except ValueError: + # error = True + # elif value_type in ["bool", "boolean"] and not isinstance(value, bool): + # # Try to make it a boolean + # if isinstance(value, int): + # new_value = bool(value) + # elif isinstance(value, str) and value in ["True", "False"]: + # new_value = value == "True" + # elif value in ["0", "1", 0, 1]: + # new_value = bool(int(value)) + # else: + # error = True + # elif value_type in ["str", "string"] and not isinstance(value, str): + # new_value = str(value) + # else: + # update = False + + # if error: + # raise AttributeError( + # f"Parameter '{name}' has an invalid value '{value}'. Must be of type {value_type}" + # ) + + # if update: + # self.param(name, new_value) + # value = new_value + + # return value + def write_output(self, suffix: str, params: Dict[str, Any]) -> None: """Appends data to the dataflow json file (passed into next pipeline process). 
@@ -233,11 +307,19 @@ def write_output(self, suffix: str, params: Dict[str, Any]) -> None: The data to append into the file """ # Remove null params - params = {k: v for k, v in params.items() if v is not None} + output_params = {k: v for k, v in params.items() if v is not None} - with open(f"dataflow_{suffix}.json", "a") as fh: - json.dump(params, fh) - fh.write("\n") + dataflow_file = f"dataflow_{suffix}.json" + dataflow_output_path = self.get_param("dataflow_output_path", {"type": str}) + if dataflow_output_path: + dataflow_file = os.path.join(dataflow_output_path, dataflow_file) + + with open(dataflow_file, "a") as fh: + if output_params: + json.dump(output_params, fh) + fh.write("\n") + else: + fh.write("") def write_all_output(self, suffix: str) -> None: """Appends all of the parameters in the object into the dataflow json file. diff --git a/src/python/ensembl/production/xrefs/Base.py b/src/python/ensembl/production/xrefs/Base.py index 3a59abfc0..04aad4971 100644 --- a/src/python/ensembl/production/xrefs/Base.py +++ b/src/python/ensembl/production/xrefs/Base.py @@ -21,119 +21,72 @@ import fnmatch import gzip import importlib -import wget +import wget # type: ignore import threading -import json import logging -import time import random -import csv -import subprocess -import unicodedata -from sqlalchemy import create_engine, select, insert, update, text, func, and_, delete -from sqlalchemy.engine.url import make_url, URL +from sqlalchemy import create_engine, select, text +from sqlalchemy.dialects.mysql import insert +from sqlalchemy.engine.url import make_url from sqlalchemy.engine import Engine, Connection -from sqlalchemy.orm import aliased from sqlalchemy_utils import database_exists, create_database, drop_database from urllib.parse import urlparse from ftplib import FTP from itertools import groupby from configparser import ConfigParser from datetime import datetime -from pyspark import SparkConf -from pyspark.sql import SparkSession from typing import IO, List, Dict, Any, Iterator, Optional from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper from ensembl.xrefs.xref_source_db_model import ( Base as XrefSourceDB, - Source as SourceSORM, - Version as VersionORM, - ChecksumXref as ChecksumXrefSORM, + Source as SourceSORM ) from ensembl.xrefs.xref_update_db_model import ( Base as XrefUpdateDB, Source as SourceUORM, SourceURL as SourceURLORM, - Xref as XrefUORM, - PrimaryXref as PrimaryXrefORM, - DependentXref as DependentXrefUORM, - CoordinateXref as CoordinateXrefORM, - GeneDirectXref as GeneDirectXrefORM, - TranscriptDirectXref as TranscriptDirectXrefORM, - TranslationDirectXref as TranslationDirectXrefORM, - Synonym as SynonymORM, - Pairs as PairsORM, - Species as SpeciesORM, - MappingJobs as MappingJobsORM, - Mapping as MappingORM, + Species as SpeciesORM ) -from ensembl.core.models import ( - Meta as MetaCORM, - Analysis as AnalysisORM, - AnalysisDescription as AnalysisDescriptionORM, - SeqRegion as SeqRegionORM, - CoordSystem as CoordSystemORM, - Dna as DnaORM, - Gene as GeneORM, - Transcript as TranscriptORM, - Translation as TranslationORM, - Exon as ExonORM, - ExonTranscript as ExonTranscriptORM, - SupportingFeature as SupportingFeatureORM, - DnaAlignFeature as DnaAlignFeatureORM, - AttribType as AttribTypeORM, - TranscriptAttrib as TranscriptAttribORM, - SeqRegionAttrib as SeqRegionAttribORM, - Xref as XrefCORM, - DependentXref as DependentXrefCORM, - ExternalDb as ExternalDbORM, - ObjectXref as ObjectXrefCORM, -) +from ensembl.core.models import Meta as 
MetaCORM from ensembl.common.Params import Params - class Base(Params): """Class to represent the base of xref modules. Inherits the Params class.""" - def __init__(self, params: Dict[str, Any] = None, parse_dataflow_json: bool = True) -> None: - """Calls the parent __init__ then sets some specific parameters. + def __init__(self, params: Optional[Dict[str, Any]] = None, parse_dataflow_json: Optional[bool] = True, testing: bool = False) -> None: + """ + Initialize the Base class with specific parameters. Parameters ---------- - params: dict, optional - The parameters to start the object with. If defined, command-line parameters won't be parsed (default is None) - parse_dataflow_json: bool, optional - Specifies whether to parse an option called 'dataflow' in the provided options (default is True) + params: Optional[Dict[str, Any]] + Initial parameters for the object. If provided, command-line parameters will not be parsed (default is None). + parse_dataflow_json: Optional[bool] + Whether to parse an option called 'dataflow' in the provided options (default is True). """ super().__init__(params, parse_dataflow_json) - self.param( - "metasearch_url", "http://registry-grpc.ebi.ac.uk:8080/registry/metaSearch" - ) + self.set_param("metasearch_url", "http://registry-grpc.ebi.ac.uk:8080/registry/metaSearch") # Initialize the logfile for this run (except for the Alignment module) module_name = self.__class__.__name__ - if module_name != "Alignment": - if self.param("log_timestamp"): - current_timestamp = self.param("log_timestamp") - else: - current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + if module_name != "Alignment" and not testing: + current_timestamp = self.get_param("log_timestamp", {"default": datetime.now().strftime("%Y%m%d_%H%M%S"), "type": str}) log_path = os.path.join( - self.param_required("base_path"), "logs", current_timestamp + self.get_param("base_path", {"required": True}), "logs", current_timestamp ) - if not os.path.exists(log_path): - os.makedirs(log_path, exist_ok=True) + os.makedirs(log_path, exist_ok=True) log_file = os.path.join( log_path, - "tmp_logfile_" + module_name + "_" + str(random.randint(0, 5000)), + f"tmp_logfile_{module_name}_{random.randint(0, 5000)}", ) self._log_file = log_file @@ -155,43 +108,49 @@ def create_source_db(self, source_url: str, reuse_db_if_present: bool) -> None: Parameters ---------- source_url: str - The source database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname] + The source database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname]. reuse_db_if_present: bool - If set to False, the database defined by provided URL will be dropped before creating a new one + If set to True, the existing database will be reused if present. """ url = make_url(source_url) engine = create_engine(url, isolation_level="AUTOCOMMIT") - if url.database and reuse_db_if_present: + if reuse_db_if_present and database_exists(engine.url): + logging.info(f"Database {url.database} already exists and reuse_db_if_present is True. 
Skipping creation.") return if database_exists(engine.url): + logging.info(f"Dropping existing database {url.database}.") drop_database(engine.url) + + logging.info(f"Creating new database {url.database}.") create_database(engine.url) XrefSourceDB.metadata.create_all(engine) + logging.info(f"Database {url.database} created successfully.") def download_file(self, file: str, base_path: str, source_name: str, extra_args: Dict[str, Any]) -> str: - """Downloads an xref file and saves into provided space. + """Downloads an xref file and saves it into the provided space. Parameters ---------- file: str - The URL of the file to download. Acceptable URL schemes: ftp, http, and https + The URL of the file to download. Acceptable URL schemes: ftp, http, and https. base_path: str - The path to save the downloaded file into + The path to save the downloaded file into. source_name: str - The xref source name - extra_args: dict + The xref source name. + extra_args: Dict[str, Any] Extra options, including: - - skip_download_if_file_present: If set to True, file is only downloaded if does not exist - - db: The type of external db for the xref source (only relevent here if equal to 'checksum') - - release: If set to 'version', then this is a version file download - - rel_number: The URL used to retrieve the release number (only for RefSeq) - - catalog: The URL used to retrieve the release catalog (only for RefSeq) + - skip_download_if_file_present: If set to True, file is only downloaded if it does not exist. + - db: The type of external db for the xref source (only relevant here if equal to 'checksum'). + - release: If set to 'version', then this is a version file download. + - rel_number: The URL used to retrieve the release number (only for RefSeq). + - catalog: The URL used to retrieve the release catalog (only for RefSeq). Returns ------- - The path of the downloaded file. + str + The path of the downloaded file. Raises ------ @@ -200,63 +159,46 @@ def download_file(self, file: str, base_path: str, source_name: str, extra_args: AttributeError If file URL scheme is invalid. 
""" - # Create uri object and get scheme uri = urlparse(file) if not uri.scheme: return file - # Get extra parameters - skip_download_if_file_present = ( - extra_args.get("skip_download_if_file_present") or False - ) + skip_download_if_file_present = extra_args.get("skip_download_if_file_present", False) db = extra_args.get("db") release = extra_args.get("release") rel_number = extra_args.get("rel_number") catalog = extra_args.get("catalog") - # Create file download path - orig_source_name = source_name - source_name = re.sub(r"\/", "", source_name) - dest_dir = os.path.join(base_path, source_name) - if db and db == "checksum": - dest_dir = os.path.join(base_path, "Checksum") - if not os.path.exists(dest_dir): - os.makedirs(dest_dir, exist_ok=True) + source_name_clean = re.sub(r"\/", "", source_name) + dest_dir = os.path.join(base_path, "Checksum" if db == "checksum" else source_name_clean) + os.makedirs(dest_dir, exist_ok=True) - file_path = "" + def download_via_http(file_url: str, dest_path: str) -> None: + if not os.path.exists(dest_path) or not skip_download_if_file_present: + if os.path.exists(dest_path): + os.remove(dest_path) + wget.download(file_url, dest_path) + logging.info(f"{source_name} file downloaded via HTTP: {dest_path}") + else: + logging.info(f"{source_name} file already exists, skipping download ({dest_path})") # If file is in local ftp, copy from there if re.search("ftp.ebi.ac.uk", file): - # Construct local path - local_file = file - local_file = re.sub( - "https://ftp.ebi.ac.uk/pub/", "/nfs/ftp/public/", local_file - ) - - # Check if local file exists + local_file = re.sub("https://ftp.ebi.ac.uk/pub/", "/nfs/ftp/public/", file) if os.path.exists(local_file): file_path = os.path.join(dest_dir, os.path.basename(uri.path)) - if db and db == "checksum": - file_path = os.path.join( - dest_dir, f"{source_name}-{os.path.basename(uri.path)}" - ) + if db == "checksum": + file_path = os.path.join(dest_dir, f"{source_name_clean}-{os.path.basename(uri.path)}") if not (skip_download_if_file_present and os.path.exists(file_path)): shutil.copy(local_file, file_path) # Check if copy was successful if os.path.exists(file_path): - logging.info( - f"{orig_source_name} file copied from local FTP: {file_path}" - ) - # if release: - # return file_path - # return os.path.dirname(file_path) + logging.info(f"{source_name} file copied from local FTP: {file_path}") return file_path else: - logging.info( - f"{orig_source_name} file already exists, skipping download ({file_path})" - ) + logging.info(f"{source_name} file already exists, skipping download ({file_path})") # Handle Refseq files if re.search("RefSeq", source_name) and rel_number and catalog and not release: @@ -267,73 +209,31 @@ def download_file(self, file: str, base_path: str, source_name: str, extra_args: # Get list of files in release catalog catalog = re.sub(r"\*", str(release_number), catalog) - files_list = requests.get(catalog).text - refseq_files = files_list.split("\n") + refseq_files = requests.get(catalog).text.split("\n") files_to_download = [] - # Download each refseq file for refseq_file in refseq_files: - if not refseq_file: - continue - checksum, filename = refseq_file.split("\t") - - # Only interested in files matching pattern - if not fnmatch.fnmatch(filename, os.path.basename(uri.path)): - continue - if re.search("nonredundant_protein", filename) or re.search( - "wp_protein", filename - ): - continue - - file_path = os.path.join(dest_dir, os.path.basename(filename)) - if os.path.exists(file_path): - if 
skip_download_if_file_present: - logging.info( - f"{orig_source_name} file already exists, skipping download ({file_path})" - ) - continue - os.remove(file_path) - - file_url = os.path.join(os.path.dirname(file), filename) - files_to_download.append({"url": file_url, "path": file_path}) - logging.info( - f"{orig_source_name} file downloaded via HTTP: {file_path}" - ) + if refseq_file: + checksum, filename = refseq_file.split("\t") + + # Only interested in files matching pattern and not non-redundant or wp_protein + if fnmatch.fnmatch(filename, os.path.basename(uri.path)) and not re.search("nonredundant_protein|wp_protein", filename): + file_path = os.path.join(dest_dir, os.path.basename(filename)) + if os.path.exists(file_path): + if skip_download_if_file_present: + logging.info(f"{source_name} file already exists, skipping download ({file_path})") + continue + os.remove(file_path) + + file_url = os.path.join(os.path.dirname(file), filename) + files_to_download.append({"url": file_url, "path": file_path, "type": source_name}) self.refseq_multithreading(files_to_download) elif uri.scheme == "ftp": - ftp = FTP(uri.netloc) - ftp.login("anonymous", "-anonymous@") - ftp.cwd(os.path.dirname(uri.path)) - remote_files = ftp.nlst() - - # Download files in ftp server - for remote_file in remote_files: - # Only interested in files matching pattern - if not fnmatch.fnmatch(remote_file, os.path.basename(uri.path)): - continue - - remote_file = re.sub(r"\n", "", remote_file) - file_path = os.path.join(dest_dir, os.path.basename(remote_file)) - if db and db == "checksum": - file_path = os.path.join( - dest_dir, f"{source_name}-{os.path.basename(remote_file)}" - ) - - if not (skip_download_if_file_present and os.path.exists(file_path)): - ftp.retrbinary("RETR " + remote_file, open(file_path, "wb").write) - logging.info( - f"{orig_source_name} file downloaded via FTP: {file_path}" - ) - else: - logging.info( - f"{orig_source_name} file already exists, skipping download ({file_path})" - ) - ftp.close() - elif uri.scheme == "http" or uri.scheme == "https": - # This is the case for the release file + file_path = self.download_via_ftp(file, dest_dir, db, source_name, skip_download_if_file_present) + elif uri.scheme in ["http", "https"]: + # This is the case for the RefSeq release file if re.search("RefSeq", source_name) and rel_number and release: - # Get current release number release_number = requests.get(rel_number).json() if not release_number: raise LookupError(f"No release number in {rel_number}") @@ -342,42 +242,51 @@ def download_file(self, file: str, base_path: str, source_name: str, extra_args: uri = urlparse(file) file_path = os.path.join(dest_dir, os.path.basename(uri.path)) - if db and db == "checksum": - file_path = os.path.join( - dest_dir, f"{source_name}-{os.path.basename(uri.path)}" - ) + if db == "checksum": + file_path = os.path.join(dest_dir, f"{source_name_clean}-{os.path.basename(uri.path)}") - if not os.path.exists(file_path) or not skip_download_if_file_present: - if not skip_download_if_file_present and os.path.exists(file_path): - os.remove(file_path) - wget.download(file, file_path) - logging.info( - f"{orig_source_name} file downloaded via HTTP: {file_path}" - ) - else: - logging.info( - f"{orig_source_name} file already exists, skipping download ({file_path})" - ) + download_via_http(file, file_path) else: raise AttributeError(f"Invalid URL scheme {uri.scheme}") - # if release: - # return file_path - # return os.path.dirname(file_path) - if re.search("RefSeq", source_name) and not 
release: - return os.path.dirname(file_path) + return os.path.dirname(file_path) if re.search("RefSeq", source_name) and not release else file_path + + def download_via_ftp(self, ftp_url: str, dest_path: str, db: str, source_name: str, skip_download: bool) -> str: + uri = urlparse(ftp_url) + + ftp = FTP(uri.netloc) + ftp.login("anonymous", "-anonymous@") + ftp.cwd(os.path.dirname(uri.path)) + remote_files = ftp.nlst() + + source_name_clean = re.sub(r"\/", "", source_name) + + for remote_file in remote_files: + # Only interested in files matching pattern + if fnmatch.fnmatch(remote_file, os.path.basename(uri.path)): + file_path = os.path.join(dest_path, os.path.basename(remote_file)) + if db == "checksum": + file_path = os.path.join(dest_path, f"{source_name_clean}-{os.path.basename(remote_file)}") + + if not (skip_download and os.path.exists(file_path)): + ftp.retrbinary("RETR " + remote_file, open(file_path, "wb").write) + logging.info(f"{source_name} file downloaded via FTP: {file_path}") + else: + logging.info(f"{source_name} file already exists, skipping download ({file_path})") + ftp.quit() + return file_path - def refseq_multithreading(self, files: List[str]) -> None: + def refseq_multithreading(self, files: List[Dict[str, str]]) -> None: """Creates multiple threads to download RefSeq files in parallel. Parameters ---------- - files: list + files: List[Dict[str, str]] The list of file URLs and paths to download. """ number_of_threads = 20 - chunk_size = int(len(files) / number_of_threads) + chunk_size = len(files) // number_of_threads threads = [] for thread_index in range(number_of_threads): @@ -397,12 +306,12 @@ def refseq_multithreading(self, files: List[str]) -> None: for thread in threads: thread.join() - def download_refseq_files(self, files: List[str], start: int, end: int) -> None: + def download_refseq_files(self, files: List[Dict[str, str]], start: int, end: int) -> None: """Downloads RefSeq files from a subset of files. Parameters ---------- - files: list + files: List[Dict[str, str]] The list of file URLs and paths to download. start: int The start index of the files list. @@ -415,20 +324,19 @@ def download_refseq_files(self, files: List[str], start: int, end: int) -> None: If file download fails all attempts. """ for index in range(start, end): - failed = 0 file_url = files[index]["url"] local_path = files[index]["path"] + source_name = files[index]["type"] - for retry in range(0, 3): + for attempt in range(3): try: wget.download(file_url, local_path) - except: - failed += 1 - continue - break - - if failed > 0: - raise BufferError(f"Failed to download file {file_url}") + logging.info(f"{source_name} file downloaded via HTTP: {local_path}") + break + except Exception as e: + logging.warning(f"Attempt {attempt + 1} failed to download {file_url}: {e}") + if attempt == 2: + raise Exception(f"Failed to download file {file_url} after 3 attempts") def get_dbi(self, url: str) -> Connection: """Returns a DB connection for a provided URL. @@ -436,117 +344,144 @@ def get_dbi(self, url: str) -> Connection: Parameters ---------- url: str - The database URL to connect to + The database URL to connect to. Returns ------- - An sqlalchemy engine connection. + Connection + An sqlalchemy engine connection. 
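A small sketch of the two database helpers (the connection URL below is hypothetical):

    from sqlalchemy import text
    from ensembl.production.xrefs.Base import Base

    base = Base({"base_path": "/hypothetical/xref/run"}, testing=True)

    url = "mysql://user:pass@mysql-host:3306/xref_source_db"  # hypothetical URL
    engine = base.get_db_engine(url)        # sqlalchemy Engine (AUTOCOMMIT by default)
    with base.get_dbi(url) as dbi:          # convenience wrapper returning an open Connection
        dbi.execute(text("SELECT 1"))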
""" - connect_url = make_url(url) - engine = create_engine(connect_url, isolation_level="AUTOCOMMIT") - + engine = self.get_db_engine(url) return engine.connect() - def get_db_engine(self, url: str) -> Engine: + def get_db_engine(self, url: str, isolation_level: str = "AUTOCOMMIT") -> Engine: """Returns a DB engine for a provided URL. Parameters ---------- url: str - The database URL to create an engine for + The database URL to create an engine for. Returns ------- - An sqlalchemy engine. + Engine + An sqlalchemy engine. """ connect_url = make_url(url) - engine = create_engine(connect_url, isolation_level="AUTOCOMMIT") + engine = create_engine(connect_url, isolation_level=isolation_level) return engine def load_checksum(self, path: str, url: str) -> None: """Loads the xref checksum files into a provided database. - This first combines the checksum data from different xref sources into 1 file called checksum.txt before loading into the DB. + This first combines the checksum data from different xref sources into several chunk files before loading into the DB. + These files are finally combined into one checksum.txt file. Parameters ---------- path: str - The path where the checksum files can be found + The path where the checksum files can be found. url: str - The database URL to load the checksum data into + The database URL to load the checksum data into. + + Raises + ------ + LookupError + If no source_id is found for a source name. """ checksum_dir = os.path.join(path, "Checksum") - if not os.path.exists(checksum_dir): - os.makedirs(checksum_dir, exist_ok=True) + os.makedirs(checksum_dir, exist_ok=True) output_files = [] threshold = 50000000 counter = 1 - source_id = 1 output_fh = None # Connect to db - url = url + "?local_infile=1" + url = f"{url}?local_infile=1" db_engine = self.get_db_engine(url) with db_engine.connect() as dbi: # Get all checksum files - files = os.listdir(checksum_dir) + files = [f for f in os.listdir(checksum_dir) if not re.search("checksum", f)] - # Go through all available checksum files + # Process each checksum file index = 0 for checksum_file in files: - if re.search("checksum", checksum_file): - continue - # Get the source name and ID input_file = os.path.join(checksum_dir, checksum_file) - match = re.search(r"\/([A-Za-z]*)-.*$", input_file) - source_name = match.group(1) - source_id = self.get_source_id_from_name(dbi, source_name) + source_name = re.search(r"\/([A-Za-z]*)-.*$", input_file).group(1) + source_id = self.get_source_id_from_name(source_name, dbi) + + if not source_id: + raise LookupError(f'No source_id found for source name {source_name}') # Open the input file - input_fh = self.get_filehandle(input_file) - for line in input_fh: - # Open the output file - if not output_fh or (counter % threshold) == 0: - if output_fh: - output_fh.close() - index += 1 - output_file = os.path.join( - checksum_dir, f"checksum_{index}.txt" - ) - output_files.append(output_file) - output_fh = open(output_file, "w") - - line = line.rstrip() - (checksum_id, checksum) = re.split(r"\s+", line) - - output = [str(counter), str(source_id), checksum_id, checksum] - output_str = "\t".join(output) - output_fh.write(f"{output_str}\n") - - counter += 1 - - input_fh.close() + with self.get_filehandle(input_file) as input_fh: + for line in input_fh: + # Open the output file if needed + if not output_fh or (counter % threshold) == 0: + if output_fh: + output_fh.close() + + index += 1 + output_file = os.path.join(checksum_dir, f"checksum_{index}.txt") + 
output_files.append(output_file) + output_fh = open(output_file, "w") + + checksum_id, checksum = re.split(r"\s+", line.rstrip()) + output_fh.write(f"{counter}\t{source_id}\t{checksum_id}\t{checksum}\n") + counter += 1 if output_fh: output_fh.close() - # Add the data in the files to the db + # Load data into the database for output_file in output_files: - dbi.execute( - text( - f"load data local infile '{output_file}' into table checksum_xref" + dbi.execute(text(f"LOAD DATA LOCAL INFILE '{output_file}' INTO TABLE checksum_xref")) + + # Merge the created files + if output_files: + merged_file = os.path.join(checksum_dir, "checksum.txt") + with open(merged_file, "w") as output_fh: + for output_file in output_files: + with open(output_file, "r") as input_fh: + shutil.copyfileobj(input_fh, output_fh) + os.remove(output_file) + + def check_file_exists(self, filename: str) -> str: + """Checks if a file exists. + Tries alternative names with .gz and .Z extensions if the original file is not found. + + Parameters + ---------- + filename: str + The file to check. + + Returns + ------- + str + Original file name if found, otherwise the first alternative name found. + + Raises + ------ + FileNotFoundError + If no file name was provided. + If provided file could not be found. + """ + if not filename: + raise FileNotFoundError("No file name provided") + + if not os.path.exists(filename): + alt_filename = re.sub(r"\.(gz|Z)$", "", filename) + if not os.path.exists(alt_filename): + alt_filename = filename + ".gz" + if not os.path.exists(alt_filename): + raise FileNotFoundError( + f"Could not find either {filename} or {alt_filename}" ) - ) + return alt_filename - # Merge the created files - merged_file = os.path.join(checksum_dir, f"checksum.txt") - with open(merged_file, "w") as output_fh: - for output_file in output_files: - with open(output_file, "r") as input_fh: - shutil.copyfileobj(input_fh, output_fh) - os.remove(output_file) + return filename def get_filehandle(self, filename: str) -> IO: """Opens an appropriate read filehandle for a file based on its type. @@ -554,11 +489,12 @@ def get_filehandle(self, filename: str) -> IO: Parameters ---------- filename: str - The name and path of the file to read + The name and path of the file to read. Returns ------- - A read filehandle. + IO + A read filehandle. Raises ------ @@ -566,41 +502,27 @@ def get_filehandle(self, filename: str) -> IO: If no file name was provided. If provided file could not be found. """ - if not filename or filename == "": - raise FileNotFoundError("No file name") - - alt_filename = filename - alt_filename = re.sub(r"\.(gz|Z)$", "", alt_filename) - if alt_filename == filename: - alt_filename = alt_filename + ".gz" - - if not os.path.exists(filename): - if not os.path.exists(alt_filename): - raise FileNotFoundError( - f"Could not find either {filename} or {alt_filename}" - ) - filename = alt_filename + filename = self.check_file_exists(filename) - if re.search(r"\.(gz|Z)$", filename): - fh = gzip.open(filename, "rt") + if filename.endswith(('.gz', '.Z')): + return gzip.open(filename, "rt") else: - fh = open(filename, "r") + return open(filename, "r") - return fh - - def get_source_id_from_name(self, dbi: Connection, source_name: str) -> int: + def get_source_id_from_name(self, source_name: str, dbi: Connection) -> int: """Retrieves a source ID from its name from a database. Parameters ---------- - dbi: db connection - The database connection to query in source_name: str - The name of the source + The name of the source. 
+ dbi: Connection + The database connection to query in. Returns ------- - The source ID. + int + The source ID. """ source_id = dbi.execute( select(SourceSORM.source_id).where(SourceSORM.name == source_name) @@ -608,31 +530,42 @@ def get_source_id_from_name(self, dbi: Connection, source_name: str) -> int: return source_id - def get_file_sections(self, file: str, delimiter: str) -> Iterator[List[str]]: + def get_file_sections(self, filename: str, delimiter: str, encoding: str = None) -> Iterator[List[str]]: """Reads a provided file by sections, separated by a provided delimiter. This function uses 'yield' to provide the file sections one by one. Parameters ---------- file: str - The name and path of the file to read + The name and path of the file to read. delimiter: str - The character or string separating the file sections + The character or string separating the file sections. + encoding: str + The encoding of the file (default is None). Returns ------- - A yield of file sections. + Iterator[List[str]] + A generator yielding file sections as lists of strings. """ - if re.search(r"\.(gz|Z)$", file): - with gzip.open(file, "rt") as fh: - groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) - for key, group in groups: + filename = self.check_file_exists(filename) + + def read_file(fh: IO) -> Iterator[List[str]]: + groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) + for key, group in groups: + if not key: yield list(group) + + if filename.endswith(('.gz', '.Z')): + if encoding: + with gzip.open(filename, "rt", encoding=encoding, errors="replace") as fh: + yield from read_file(fh) + else: + with gzip.open(filename, "rt") as fh: + yield from read_file(fh) else: - with open(file, "r") as fh: - groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) - for key, group in groups: - yield list(group) + with open(filename, "r") as fh: + yield from read_file(fh) def create_xref_db(self, url: str, config_file: str) -> None: """Creates the xref database from model. @@ -641,50 +574,53 @@ def create_xref_db(self, url: str, config_file: str) -> None: Parameters ---------- url: str - The database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname] + The database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname]. config_file: str - The name and path of the .ini file that has information about xref sources and species + The name and path of the .ini file that has information about xref sources and species. """ engine = create_engine(url, isolation_level="AUTOCOMMIT") # Drop database and create again if database_exists(engine.url): + logging.info(f"Dropping existing database {engine.url.database}.") drop_database(engine.url) + logging.info(f"Creating new database {engine.url.database}.") create_database(engine.url) XrefUpdateDB.metadata.create_all(engine) + logging.info(f"Database {engine.url.database} created successfully.") - xref_dbi = engine.connect() - self.populate_xref_db(xref_dbi, config_file) + with engine.connect() as xref_dbi: + self.populate_xref_db(xref_dbi, config_file) + logging.info(f"Database {engine.url.database} populated successfully.") def populate_xref_db(self, dbi: Connection, config_file: str) -> None: """Populates the xref database with configuration data. Parameters ---------- - dbi: db connection - The xref database connection + dbi: Connection + The xref database connection. 
config_file: str - The name and path of the .ini file that has information about xref sources and species to populate the database with + The name and path of the .ini file that has information about xref sources and species to populate the database with. Raises ------ KeyError If a source exists in a species section in the configuration file, but has no source section of its own. """ - source_ids, source_parsers, species_sources = {}, {}, {} - species_sections, sources_sections = {}, {} - config = ConfigParser() config.read(config_file) - for section_name in config.sections(): - section = config[section_name] - (keyword, name) = re.split(r"\s+", section_name) + species_sections = { + name.split(" ", 1)[1]: section for name, section in config.items() if name.startswith("species") + } + sources_sections = { + name.split(" ", 1)[1]: section for name, section in config.items() if name.startswith("source") + } - if keyword == "source": - sources_sections[name] = section - elif keyword == "species": - species_sections[name] = section + species_sources = {} + source_ids = {} + source_parsers = {} # Parse species sections for species_name, section in species_sections.items(): @@ -707,10 +643,8 @@ def populate_xref_db(self, dbi: Connection, config_file: str) -> None: species_sources[species_id] = sources - source_id = 0 # Parse source sections - for source_name, section in sorted(sources_sections.items()): - source_id += 1 + for source_id, (source_name, section) in enumerate(sorted(sources_sections.items()), start=1): source_db_name = section.get("name") order = section.get("order") priority = section.get("priority") @@ -733,15 +667,11 @@ def populate_xref_db(self, dbi: Connection, config_file: str) -> None: source_ids[source_name] = source_id source_parsers[source_id] = parser - # Add source url rows + # Add source_url rows for species_id, sources in species_sources.items(): - source_names = sources.split(",") - - for source_name in source_names: - if not source_ids.get(source_name): - raise KeyError( - f"No source section found for {source_name} in config file" - ) + for source_name in sources.split(","): + if source_name not in source_ids: + raise KeyError(f"No source section found for {source_name} in config file") source_id = source_ids[source_name] parser = source_parsers[source_id] @@ -756,103 +686,98 @@ def get_source_id(self, dbi: Connection, parser: str, species_id: int, name: str Parameters ---------- - dbi: db connection - The database connection to query in + dbi: Connection + The database connection to query in. parser: str - The source parser + The source parser. species_id: int - The ID of the species related to the source + The ID of the species related to the source. name: str - The source name + The source name. division_id: int - The ID of the division related to the source + The ID of the division related to the source. Returns ------- - The source ID. + Optional[int] + The source ID or None if cannot be found. 
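For illustration, a sketch of the section layout that populate_xref_db expects, using the same section-name splitting as above; the .ini keys and values below are hypothetical:

    from configparser import ConfigParser

    # Hypothetical xref_config.ini content; section names are split on the first space
    # into a keyword ("source" or "species") and a name.
    config = ConfigParser()
    config.read_string("""
    [source RefSeq_dna]
    name = RefSeq_dna
    priority = 2
    parser = RefSeqGPFFParser

    [species homo_sapiens]
    taxonomy_id = 9606
    sources = RefSeq_dna
    """)

    species_sections = {name.split(" ", 1)[1]: section for name, section in config.items() if name.startswith("species")}
    sources_sections = {name.split(" ", 1)[1]: section for name, section in config.items() if name.startswith("source")}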
""" - name = "%" + name + "%" + name_pattern = f"%{name}%" source_id = None + # Query by parser, species_id, and name pattern query = select(SourceURLORM.source_id).where( SourceUORM.source_id == SourceURLORM.source_id, SourceURLORM.parser == parser, SourceURLORM.species_id == species_id, + SourceUORM.name.like(name_pattern), ) result = dbi.execute(query) if result.rowcount == 1: - source_id = result.scalar() - - query = ( - select(SourceURLORM.source_id) - .where( - SourceUORM.source_id == SourceURLORM.source_id, - SourceURLORM.parser == parser, - SourceURLORM.species_id == species_id, - ) - .filter(SourceUORM.name.like(name)) + return result.scalar() + + # Query by parser and species_id + query = select(SourceURLORM.source_id).where( + SourceUORM.source_id == SourceURLORM.source_id, + SourceURLORM.parser == parser, + SourceURLORM.species_id == species_id, ) result = dbi.execute(query) if result.rowcount == 1: - source_id = result.scalar() - - if not source_id: - query = ( - select(SourceURLORM.source_id) - .where( - SourceUORM.source_id == SourceURLORM.source_id, - SourceURLORM.parser == parser, - SourceURLORM.species_id == division_id, - ) - .filter(SourceUORM.name.like(name)) - ) - result = dbi.execute(query).first() - if result: - source_id = result[0] + return result.scalar() - return source_id + # Query by parser, division_id, and name pattern + query = select(SourceURLORM.source_id).where( + SourceUORM.source_id == SourceURLORM.source_id, + SourceURLORM.parser == parser, + SourceURLORM.species_id == division_id, + SourceUORM.name.like(name_pattern), + ) + result = dbi.execute(query).scalar() + if result: + return result + + return None def get_taxon_id(self, dbi: Connection) -> int: - """Retrieves the species.taxonomy_id value of the meta table in a database. + """Retrieves the species.taxonomy_id value from the meta table in a database. Parameters ---------- - dbi: db connection - The database connection to query in + dbi: Connection + The database connection to query in. Returns ------- - The taxonomy ID in the database or 1 if not found. + int + The taxonomy ID in the database or 1 if not found. """ result = dbi.execute( - select(MetaCORM.meta_value).where( - MetaCORM.meta_key == "species.taxonomy_id" - ) + select(MetaCORM.meta_value).where(MetaCORM.meta_key == "species.taxonomy_id") ) - if result.rowcount > 0: - return int(result.scalar()) - return 1 + taxon_id = result.scalar() + return int(taxon_id) if taxon_id else 1 def get_division_id(self, dbi: Connection) -> int: - """Retrives the division ID from a database based on the species.division value of the meta table. + """Retrieves the division ID from a database based on the species.division value in the meta table. Parameters ---------- - dbi: db connection - The database connection to query in + dbi: Connection + The database connection to query in. Returns ------- - The division ID in the database or 1 if not found + int + The division ID in the database or 1 if not found. 
""" result = dbi.execute( select(MetaCORM.meta_value).where(MetaCORM.meta_key == "species.division") ) - if result.rowcount > 0: - division = result.scalar() - + division = result.scalar() + if division: division_taxon = { "Ensembl": 7742, "EnsemblVertebrates": 7742, @@ -862,112 +787,105 @@ def get_division_id(self, dbi: Connection) -> int: "Plants": 33090, "EnsemblPlants": 33090, } - - division_id = division_taxon.get(division) - if division_id: - return int(division_id) + return division_taxon.get(division, 1) return 1 - def get_path(self, base_path: str, species: str, release: int, category: str, file_name: str = None) -> str: - """Creates directories based on provided data. + def get_path(self, base_path: str, species: str, release: int, category: str, file_name: Optional[str] = None) -> str: + """Creates directories based on provided data and returns the full file path. Parameters ---------- base_path: str - The base file path + The base file path. species: str - The species name + The species name. release: int - The ensEMBL release number + The Ensembl release number. category: str - The file category - file_name: str, optional - The file name + The file category. + file_name: Optional[str] + The file name. Returns ------- - A file path. + str + The full file path. """ - full_path = os.path.join(base_path, species, release, category) - if not os.path.exists(full_path): - os.makedirs(full_path, exist_ok=True) + full_path = os.path.join(base_path, species, str(release), category) + os.makedirs(full_path, exist_ok=True) - if file_name: - return os.path.join(full_path, file_name) - else: - return full_path + return os.path.join(full_path, file_name) if file_name else full_path def get_db_from_registry(self, species: str, group: str, release: int, registry: str) -> Optional[str]: - """Looks up a db in the registry and returns an sqlaclehmy angine for it. + """Looks up a database in the registry and returns its URL. Parameters ---------- species: str - The species name + The species name. group: str - The db group (core, ccds, otherfeatures, etc...) + The database group (core, ccds, otherfeatures, etc.). release: int - The ensEMBL release number + The Ensembl release number. registry: str - The registry url + The registry URL. Returns ------- - A db engine or 0 if no db is found. + Optional[str] + The database URL or None if no database is found. 
""" - # Fix registry url, if needed - match = re.search(r"^(.*)://(.*)", registry) - if match: - registry = match.group(2) - match = re.search(r"(.*)/(.*)", registry) - if match: - registry = match.group(1) - - metasearch_url = self.param_required("metasearch_url") + # Clean up registry URL if needed + registry = re.sub(r"^(.*://)?(.*?)(/.*)?$", r"\2", registry) + + metasearch_url = self.get_param("metasearch_url", {"required": True}) metasearch_body = { "name_pattern": f"{species}_{group}%", - "filters": [ - {"meta_key": "schema_version", "meta_value": str(release)}, - ], + "filters": [{"meta_key": "schema_version", "meta_value": str(release)}], "servers": [registry], } - dbs = requests.post(metasearch_url, json=metasearch_body).json() - dbs = dbs[registry] + response = requests.post(metasearch_url, json=metasearch_body) + response.raise_for_status() + dbs = response.json().get(registry, []) - if len(dbs) > 0: - db_url = "mysql://" + dbs[0] - return db_url - else: - return None + if dbs: + return f"mysql://{dbs[0]}" + return None - def get_xref_mapper(self, xref_url: str, species: str, base_path: str, release: int, core_url: str = None, registry: str = None) -> BasicMapper: - """Retrives a mapper object based on species. + def get_xref_mapper(self, xref_url: str, species: str, base_path: str, release: int, core_url: Optional[str] = None, registry: Optional[str] = None) -> BasicMapper: + """Retrieves a mapper object based on species. Parameters ---------- xref_url: str - The xref db connection url + The xref db connection URL. species: str - The species name + The species name. base_path: str - The base file path + The base file path. release: int - The ensEMBL release number - core_db: str, optional - The species core db connection url - registry: str, optional - The registry url + The Ensembl release number. + core_url: Optional[str] + The species core db connection URL. + registry: Optional[str] + The registry URL. Returns ------- - A mapper object + BasicMapper + A mapper object. + + Raises + ------ + AttributeError + If neither core_url nor registry is provided. 
""" - # Need either core_db or registry + # Need either core_url or registry if not core_url and not registry: raise AttributeError( - f"Method get_xref_mapper: need to provide either a core DB URL or a registry URL" + "Method get_xref_mapper: need to provide either a core DB URL or a registry URL" ) # Create needed db connections @@ -977,15 +895,13 @@ def get_xref_mapper(self, xref_url: str, species: str, base_path: str, release: core_db = self.get_db_engine(core_url) xref_db = self.get_db_engine(xref_url) - # Extract host and dbname from xref url + # Extract host and dbname from xref URL xref_url_obj = make_url(xref_url) host = xref_url_obj.host dbname = xref_url_obj.database # Locate the fasta files - cdna_path = self.get_path( - base_path, species, release, "ensembl", "transcripts.fa" - ) + cdna_path = self.get_path(base_path, species, release, "ensembl", "transcripts.fa") pep_path = self.get_path(base_path, species, release, "ensembl", "peptides.fa") # Try to find a species-specific mapper first diff --git a/src/python/ensembl/production/xrefs/Checksum.py b/src/python/ensembl/production/xrefs/Checksum.py index 7edf452e0..2d990cf70 100644 --- a/src/python/ensembl/production/xrefs/Checksum.py +++ b/src/python/ensembl/production/xrefs/Checksum.py @@ -14,14 +14,18 @@ """Checksum module for the Xref Download pipeline.""" -from ensembl.production.xrefs.Base import * +import logging +from sqlalchemy import select, func +from ensembl.xrefs.xref_source_db_model import ChecksumXref as ChecksumXrefSORM + +from ensembl.production.xrefs.Base import Base class Checksum(Base): def run(self): - base_path = self.param_required("base_path", {"type": "str"}) - source_db_url = self.param_required("source_db_url", {"type": "str"}) - skip_download = self.param_required("skip_download", {"type": "bool"}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) + skip_download: bool = self.get_param("skip_download", {"required": True, "type": bool}) logging.info("Checksum starting with parameters:") logging.info(f"Param: base_path = {base_path}") @@ -32,15 +36,17 @@ def run(self): db_engine = self.get_db_engine(source_db_url) # Check if checksums already exist - table_nonempty = 0 - if skip_download: - with db_engine.connect() as dbi: - query = select(func.count(ChecksumXrefSORM.checksum_xref_id)) - table_nonempty = dbi.execute(query).scalar() + table_empty = self.check_table_empty(db_engine) if skip_download else True # Load checksums from files into db - if not table_nonempty: + if table_empty: self.load_checksum(base_path, source_db_url) logging.info("Checksum data loaded") else: logging.info("Checksum data already exists, skipping loading") + + def check_table_empty(self, db_engine): + """Check if the checksum table is empty.""" + with db_engine.connect() as dbi: + query = select(func.count(ChecksumXrefSORM.checksum_xref_id)) + return dbi.execute(query).scalar() == 0 diff --git a/src/python/ensembl/production/xrefs/DownloadSource.py b/src/python/ensembl/production/xrefs/DownloadSource.py index f3b9f20f4..b57407938 100644 --- a/src/python/ensembl/production/xrefs/DownloadSource.py +++ b/src/python/ensembl/production/xrefs/DownloadSource.py @@ -14,39 +14,48 @@ """Download module to download xref and version files.""" -from ensembl.production.xrefs.Base import * +import logging +from sqlalchemy import select +from sqlalchemy.dialects.mysql import insert +from typing import Optional +from 
ensembl.xrefs.xref_source_db_model import ( + Source as SourceSORM, + Version as VersionORM, +) + +from ensembl.production.xrefs.Base import Base class DownloadSource(Base): def run(self): - base_path = self.param_required("base_path", {"type": "str"}) - parser = self.param_required("parser", {"type": "str"}) - name = self.param_required("name", {"type": "str"}) - priority = self.param_required("priority", {"type": "int"}) - source_db_url = self.param_required("source_db_url", {"type": "str"}) - file = self.param_required("file", {"type": "str"}) - skip_download = self.param_required("skip_download", {"type": "bool"}) - db = self.param("db", None, {"type": "str"}) - version_file = self.param("version_file", None, {"type": "str"}) - rel_number = self.param("rel_number", None, {"type": "str"}) - catalog = self.param("catalog", None, {"type": "str"}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + parser: str = self.get_param("parser", {"required": True, "type": str}) + name: str = self.get_param("name", {"required": True, "type": str}) + priority: int = self.get_param("priority", {"required": True, "type": int}) + source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) + file: str = self.get_param("file", {"required": True, "type": str}) + skip_download: bool = self.get_param("skip_download", {"required": True, "type": bool}) + db: Optional[str] = self.get_param("db", {"type": str}) + version_file: Optional[str] = self.get_param("version_file", {"type": str}) + rel_number: Optional[str] = self.get_param("rel_number", {"type": str}) + catalog: Optional[str] = self.get_param("catalog", {"type": str}) logging.info(f"DownloadSource starting for source {name}") # Download the main xref file - extra_args = {} - extra_args["skip_download_if_file_present"] = skip_download - extra_args["db"] = db + extra_args = { + "skip_download_if_file_present": skip_download, + "db": db + } if rel_number and catalog: - extra_args["rel_number"] = rel_number - extra_args["catalog"] = catalog - file_name = self.download_file(file, base_path, name, extra_args) + extra_args.update({"rel_number": rel_number, "catalog": catalog}) + file_path = self.download_file(file, base_path, name, extra_args) # Download the version file - version = "" + version_path = None if version_file: extra_args["release"] = "version" - version = self.download_file(version_file, base_path, name, extra_args) + version_path = self.download_file(version_file, base_path, name, extra_args) # Update source db db_engine = self.get_db_engine(source_db_url) @@ -54,20 +63,20 @@ def run(self): dbi.execute( insert(SourceSORM) .values(name=name, parser=parser) - .prefix_with("IGNORE") + .on_duplicate_key_update(parser=parser) ) - source_id = dbi.execute( select(SourceSORM.source_id).where(SourceSORM.name == name) ).scalar() + dbi.execute( insert(VersionORM) .values( source_id=source_id, - file_path=file_name, + file_path=file_path, db=db, priority=priority, - revision=version, + revision=version_path, ) - .prefix_with("IGNORE") + .on_duplicate_key_update(revision=version_path) ) diff --git a/src/python/ensembl/production/xrefs/EmailNotification.py b/src/python/ensembl/production/xrefs/EmailNotification.py index 4295041a0..f574f2dc3 100644 --- a/src/python/ensembl/production/xrefs/EmailNotification.py +++ b/src/python/ensembl/production/xrefs/EmailNotification.py @@ -14,238 +14,52 @@ """Email module to send user emails notifying of xref pipelines end, with important information and statistics.""" 
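# Sketch of the MySQL "insert or update" pattern adopted in DownloadSource above,
# using SQLAlchemy's dialect-specific insert. SourceSORM is the Source model from
# ensembl.xrefs.xref_source_db_model as imported in the patch; a unique key on
# `name` is assumed for the upsert to behave as intended.
from sqlalchemy import select
from sqlalchemy.dialects.mysql import insert
from sqlalchemy.engine import Connection
from ensembl.xrefs.xref_source_db_model import Source as SourceSORM

def upsert_source(dbi: Connection, name: str, parser: str) -> int:
    # Insert the source row, or refresh its parser if the name already exists.
    stmt = insert(SourceSORM).values(name=name, parser=parser).on_duplicate_key_update(parser=parser)
    dbi.execute(stmt)
    # Fetch the id of the row that now exists for this source name.
    return dbi.execute(select(SourceSORM.source_id).where(SourceSORM.name == name)).scalar()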
-from ensembl.production.xrefs.Base import * - +import os +import re from smtplib import SMTP from email.message import EmailMessage +from typing import Dict, Any, Tuple +from ensembl.production.xrefs.Base import Base class EmailNotification(Base): + INDENT = "   " + def run(self): - pipeline_name = self.param_required("pipeline_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - email_address = self.param_required("email", {"type": "str"}) - email_server = self.param_required("email_server", {"type": "str"}) - log_timestamp = self.param("log_timestamp", None, {"type": "str"}) + pipeline_name: str = self.get_param("pipeline_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + email_address: str = self.get_param("email", {"required": True, "type": str}) + email_server: str = self.get_param("email_server", {"required": True, "type": str}) + log_timestamp: str = self.get_param("log_timestamp", {"type": str}) email_message = f"The {pipeline_name} has completed its run.
" - indent = "   " - if log_timestamp: # Get the path of the log files log_path = os.path.join(base_path, "logs", log_timestamp) - # Read the log file if os.path.exists(log_path): - parameters = {} + # Combine the logs into a single file + main_log_file = self.combine_logs(base_path, log_timestamp, pipeline_name) - # Copy different log files into a main one - main_log_file = self.combine_logs( - base_path, log_timestamp, pipeline_name - ) - - # Read the full logs + # Read the logs with open(main_log_file) as fh: data = fh.read() - # Extract parameter data - parameters_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Param: (\w+) = (.*)", data - ) - parameters = {param[0]: param[1] for param in parameters_list} - - email_message += ( - "
The pipeline was run with the following parameters:
" - ) - for param_name, param_value in parameters.items(): - if param_value == "1" or param_value == "0": - param_value = bool(param_value) - email_message += f"{param_name} = {param_value}
" + # Extract the parameters and format them + parameters = self.extract_parameters(data) + email_message += self.format_parameters(parameters) # Extract statistics data from logs if re.search("Download", pipeline_name): - sources_data, added_species, skipped_species = {}, {}, {} - - # Get sources scheduled for download - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to download: ([\w\/]+)", - data, - ) - sources_data = { - source: {"to_download": 1} for source in matches_list - } - - # Get sources scheduled for cleanup - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to cleanup: ([\w\/]+)", - data, - ) - for source in matches_list: - sources_data[source].update({"to_cleanup": 1}) - - # Get sources cleaned up - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source ([\w\/]+) cleaned up", - data, - ) - for source in matches_list: - sources_data[source].update({"cleaned_up": 1}) - - # Get sources with skipped download - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file already exists, skipping download \((.*)\)", - data, - ) - for source in matches_list: - sources_data[source[0]].update( - {"skipped": os.path.dirname(source[1])} - ) - - # Get sources downloaded - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file downloaded via (HTTP|FTP): (.*)", - data, - ) - for source in matches_list: - sources_data[source[0]].update( - {"downloaded": source[1] + "|" + os.path.dirname(source[2])} - ) - - # Get sources copied from local ftp - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file copied from local FTP: (.*)", - data, - ) - for source in matches_list: - sources_data[source[0]].update( - {"copied": os.path.dirname(source[1])} - ) - - # Get skipped species - skipped_species_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) skipped species = (\d+)", - data, - ) - skipped_species = { - source[0]: source[1] for source in skipped_species_list - } - - # Get species with files created - added_species_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) species files created = (\d+)", - data, - ) - added_species = { - source[0]: source[1] for source in added_species_list - } - - # Add source statistics to email message - email_message += "
 --Source Statistics--<br>" for source_name, source_values in sources.items(): - email_message += f"{source_name}:<br>" - if source_values.get("to_download"): - email_message += f"{indent}Scheduled for download ✔<br>" - - if source_values.get("downloaded"): - (download_type, file_path) = source_values[ - "downloaded" - ].split("|") - email_message += f"{indent}File downloaded via {download_type} into {file_path}<br>" - elif source_values.get("copied"): - email_message += ( - indent - + "File(s) copied from local FTP into %s<br>" - % (source_values["copied"]) - ) - elif source_values.get("skipped"): - email_message += ( - indent - + "File(s) download skipped, already exists in %s<br>" - % (source_values["skipped"]) - ) - - if source_values.get("to_cleanup"): - email_message += f"{indent}Scheduled for cleanup ✔<br>" - if source_values.get("cleaned_up"): - email_message += f"{indent}Cleaned up ✔<br>" - - # Add species statistics to email message - email_message += "<br> --Species Statistics--<br>" - email_message += "Skipped Species (files already exist):<br>" - for source_name, count in skipped_species.items(): - email_message += f"{indent}{source_name}: {count}<br>" - email_message += "Added Species (files created):<br>" - for source_name, count in added_species.items(): - email_message += f"{indent}{source_name}: {count}<br>" - - email_message += "
To run the Xref Process Pipeline based on the data from this pipeline, use the same --source_db_url, and --config_file values provided to this pipeline." + sources_data, added_species, skipped_species = self.extract_download_statistics(data) + email_message += self.format_download_statistics(sources_data, added_species, skipped_species) elif re.search("Process", pipeline_name): - parsed_sources, species_counts = {}, {} - - # Get species mapped - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Mapping starting for species '([\w\/]+)'", - data, - ) - for species_name in matches_list: - species_counts[species_name] = { - "DIRECT": 0, - "INFERRED_PAIR": 0, - "MISC": 0, - "CHECKSUM": 0, - "DEPENDENT": 0, - "SEQUENCE_MATCH": 0, - } - - # Get number of xrefs added per species per source - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| \tLoaded (\d+) ([\w\/]+) xrefs for '([\w\/]+)'", - data, - ) - for species in matches_list: - count = int(species[0]) - xref_type = species[1] - species_name = species[2] - - prev_count = species_counts[species_name][xref_type] - count += prev_count - - species_counts[species_name][xref_type] = count - - # Get parsed sources per species - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ParseSource starting for source '([\w\/]+)' with parser '([\w\/]+)' for species '([\w\/]+)'", - data, - ) - for species in matches_list: - source_name = species[0] - parser = species[1] - species_name = species[2] - - parsed_sources[species_name].update({source_name: parser}) - - # Add species statistics to email message - email_message += "
 --Species Statistics--<br>" - for species_name, species_data in parsed_sources.items(): - email_message += f"{species_name}:
" - email_message += f"{indent}Sources parsed: " + ",".join(keys(species_data)) - - xref_counts = species_counts[species_name] - email_message += indent + "Xrefs added: " - for xref_type, count in xref_counts.items(): - email_message += f"{count} {xref_type} " + parsed_sources, species_counts = self.extract_process_statistics(data) + email_message += self.format_process_statistics(parsed_sources, species_counts) # Send email - message = EmailMessage() - message["Subject"] = f"{pipeline_name} Finished" - message["From"] = email_address - message["To"] = email_address - message.set_content(email_message, "html") - - smtp = SMTP(email_server) - smtp.send_message(message) + self.send_email(email_address, email_server, pipeline_name, email_message) def combine_logs(self, base_path: str, timestamp: str, type: str) -> str: ordered_processes = { @@ -277,18 +91,12 @@ def combine_logs(self, base_path: str, timestamp: str, type: str) -> str: "EmailNotification", ], } - log_order = ( - ordered_processes["download"] - if re.search("Download", type) - else ordered_processes["process"] - ) + log_order = ordered_processes["download"] if re.search("Download", type) else ordered_processes["process"] log_path = os.path.join(base_path, "logs", timestamp) log_files = os.listdir(log_path) - main_log_file = os.path.join( - base_path, "logs", timestamp, "logfile_" + timestamp - ) + main_log_file = os.path.join(base_path, "logs", timestamp, "logfile_" + timestamp) # Copy different log files into a main one with open(main_log_file, "a") as out_fh: @@ -297,10 +105,159 @@ def combine_logs(self, base_path: str, timestamp: str, type: str) -> str: matches = [s for s in log_files if re.search(pattern, s)] for log_file in matches: - log_file = os.path.join(log_path, log_file) - with open(log_file) as in_fh: - log_data = in_fh.read() - out_fh.write(log_data) - os.remove(log_file) + log_file_path = os.path.join(log_path, log_file) + with open(log_file_path) as in_fh: + out_fh.write(in_fh.read()) + os.remove(log_file_path) return main_log_file + + def extract_parameters(self, data: str) -> Dict[str, str]: + parameters_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Param: (\w+) = (.*)", data) + return {param[0]: param[1] for param in parameters_list} + + def format_parameters(self, parameters: Dict[str, str]) -> str: + message = "
 The pipeline was run with the following parameters:<br>" + for param_name, param_value in parameters.items(): + message += f"{param_name} = {param_value}
" + + return message + + def extract_download_statistics(self, data: str) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, str], Dict[str, str]]: + sources_data = self.extract_sources_data(data) + skipped_species = self.extract_skipped_species(data) + added_species = self.extract_added_species(data) + + return sources_data, added_species, skipped_species + + def extract_sources_data(self, data: str) -> Dict[str, Dict[str, Any]]: + sources_data = {} + + sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to download: ([\w\/]+)", "to_download")) + sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to cleanup: ([\w\/]+)", "to_cleanup")) + sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source ([\w\/]+) cleaned up", "cleaned_up")) + sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file already exists, skipping download \((.*)\)", "skipped", True)) + sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file downloaded via (HTTP|FTP): (.*)", "downloaded", True)) + sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file copied from local FTP: (.*)", "copied", True)) + + return sources_data + + def extract_sources(self, data: str, pattern: str, key: str, split: bool = False) -> Dict[str, Dict[str, Any]]: + sources = {} + + matches_list = re.findall(pattern, data) + for match in matches_list: + if split: + if key == "skipped" or key == "copied": + val = os.path.dirname(match[1]) + else: + val = f"{match[1]}|" + os.path.dirname(match[2]) + sources[match[0]] = {key: val} + else: + sources[match] = {key: True} + + return sources + + def extract_skipped_species(self, data: str) -> Dict[str, str]: + skipped_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) skipped species = (\d+)", data) + return {species[0]: species[1] for species in skipped_species_list} + + def extract_added_species(self, data: str) -> Dict[str, str]: + added_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) species files created = (\d+)", data) + return {species[0]: species[1] for species in added_species_list} + + def format_download_statistics(self, sources_data: Dict[str, Dict[str, Any]], added_species: Dict[str, str], skipped_species: Dict[str, str]) -> str: + message = "
 --Source Statistics--<br>" + + for source_name, source_values in sources_data.items(): + message += f"{source_name}:<br>" + if source_values.get("to_download"): + message += f"{self.INDENT}Scheduled for download ✔<br>" + if source_values.get("downloaded"): + download_type, file_path = source_values["downloaded"].split("|") + message += f"{self.INDENT}File downloaded via {download_type} into {file_path}<br>" + elif source_values.get("copied"): + message += f"{self.INDENT}File(s) copied from local FTP into {source_values['copied']}<br>" + elif source_values.get("skipped"): + message += f"{self.INDENT}File(s) download skipped, already exists in {source_values['skipped']}<br>" + if source_values.get("to_cleanup"): + message += f"{self.INDENT}Scheduled for cleanup ✔<br>" + if source_values.get("cleaned_up"): + message += f"{self.INDENT}Cleaned up ✔<br>" + + message += "<br> --Species Statistics--<br>" + message += "Skipped Species (files already exist):<br>" + for source_name, count in skipped_species.items(): + message += f"{self.INDENT}{source_name}: {count}<br>" + message += "Added Species (files created):<br>" + for source_name, count in added_species.items(): + message += f"{self.INDENT}{source_name}: {count}<br>" + + message += "
To run the Xref Process Pipeline based on the data from this pipeline, use the same --source_db_url, --split_files_by_species, and --config_file values provided to this pipeline." + return message + + def extract_process_statistics(self, data: str) -> Tuple[Dict[str, Dict[str, str]], Dict[str, Dict[str, int]]]: + parsed_sources = self.extract_parsed_sources(data) + species_counts = self.extract_species_counts(data) + + return parsed_sources, species_counts + + def extract_parsed_sources(self, data: str) -> Dict[str, Dict[str, str]]: + parsed_sources = {} + + matches_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ParseSource starting for source '([\w\/]+)' with parser '([\w\/]+)' for species '([\w\/]+)'", data) + for species in matches_list: + source_name, parser, species_name = species + if species_name not in parsed_sources: + parsed_sources[species_name] = {} + parsed_sources[species_name][source_name] = parser + + return parsed_sources + + def extract_species_counts(self, data: str) -> Dict[str, Dict[str, int]]: + species_counts = {} + + # Get species mapped + matches_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Mapping starting for species '([\w\/]+)'", data) + for species_name in matches_list: + species_counts[species_name] = { + "DIRECT": 0, + "INFERRED_PAIR": 0, + "MISC": 0, + "CHECKSUM": 0, + "DEPENDENT": 0, + "SEQUENCE_MATCH": 0, + } + + # Get number of xrefs added per species per source + matches_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| \tLoaded (\d+) ([\w\/]+) xrefs for '([\w\/]+)'", data) + for species in matches_list: + count, xref_type, species_name = int(species[0]), species[1], species[2] + species_counts[species_name][xref_type] += count + + return species_counts + + def format_process_statistics(self, parsed_sources: Dict[str, Dict[str, str]], species_counts: Dict[str, Dict[str, int]]) -> str: + message = "
 --Species Statistics--<br>" + + for species_name, species_data in parsed_sources.items(): + message += f"{species_name}:<br>" + message += f"{self.INDENT}Sources parsed: " + ",".join(species_data.keys()) + "<br>" + + xref_counts = species_counts[species_name] + message += f"{self.INDENT}Xrefs added: " + for xref_type, count in xref_counts.items(): + message += f"{count} {xref_type} " + message += "
" + + return message + + def send_email(self, email_address: str, email_server: str, pipeline_name: str, email_message: str) -> None: + message = EmailMessage() + message["Subject"] = f"{pipeline_name} Finished" + message["From"] = email_address + message["To"] = email_address + message.set_content(email_message, "html") + + with SMTP(email_server) as smtp: + smtp.send_message(message) diff --git a/src/python/ensembl/production/xrefs/ScheduleCleanup.py b/src/python/ensembl/production/xrefs/ScheduleCleanup.py index eeddf94e1..19388b9fb 100644 --- a/src/python/ensembl/production/xrefs/ScheduleCleanup.py +++ b/src/python/ensembl/production/xrefs/ScheduleCleanup.py @@ -14,16 +14,26 @@ """Scheduling module to create cleanup jobs for specific xref sources.""" -from ensembl.production.xrefs.Base import * +import logging +import os +import re +from typing import Optional +from sqlalchemy import select +from ensembl.xrefs.xref_source_db_model import ( + Source as SourceSORM, + Version as VersionORM, +) + +from ensembl.production.xrefs.Base import Base class ScheduleCleanup(Base): def run(self): - base_path = self.param_required("base_path", {"type": "str"}) - source_db_url = self.param_required("source_db_url", {"type": "str"}) - clean_files = self.param("clean_files", None, {"type": "bool"}) - clean_dir = self.param("clean_dir", None, {"type": "str"}) - split_files_by_species = self.param("split_files_by_species", None, {"type": "bool"}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) + clean_files: Optional[bool] = self.get_param("clean_files", {"type": bool}) + clean_dir: Optional[str] = self.get_param("clean_dir", {"type": str}) + split_files_by_species: Optional[bool] = self.get_param("split_files_by_species", {"type": bool}) logging.info("ScheduleCleanup starting with parameters:") logging.info(f"Param: base_path = {base_path}") @@ -41,6 +51,7 @@ def run(self): ) sources = dbi.execute(query).mappings().all() + cleanup_sources = 0 for source in sources: # Only cleaning RefSeq and UniProt for now if not ( @@ -50,14 +61,18 @@ def run(self): continue # Remove / char from source name to access directory - clean_name = source.name - clean_name = re.sub(r"\/", "", clean_name) + clean_name = re.sub(r"\/", "", source.name) # Send parameters into cleanup jobs for each source - if os.path.exists(os.path.join(base_path, clean_name)): + source_path = os.path.join(base_path, clean_name) + if os.path.exists(source_path): + cleanup_sources += 1 logging.info(f"Source to cleanup: {source.name}") self.write_output( "cleanup_sources", {"name": source.name, "version_file": source.revision}, ) + + if cleanup_sources == 0: + self.write_output("cleanup_sources", {}) diff --git a/src/python/ensembl/production/xrefs/ScheduleDownload.py b/src/python/ensembl/production/xrefs/ScheduleDownload.py index f9af93454..10b2a32af 100644 --- a/src/python/ensembl/production/xrefs/ScheduleDownload.py +++ b/src/python/ensembl/production/xrefs/ScheduleDownload.py @@ -14,14 +14,16 @@ """Scheduling module to create download jobs for all xref sources in config file.""" -from ensembl.production.xrefs.Base import * +import json +import logging +from ensembl.production.xrefs.Base import Base class ScheduleDownload(Base): - def run(self): - config_file = self.param_required("config_file", {"type": "str"}) - source_db_url = self.param_required("source_db_url", {"type": "str"}) - reuse_db = self.param_required("reuse_db", 
{"type": "bool"}) + def run(self) -> None: + config_file: str = self.get_param("config_file", {"required": True, "type": str}) + source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) + reuse_db: bool = self.get_param("reuse_db", {"required": True, "type": bool}) logging.info("ScheduleDownload starting with parameters:") logging.info(f"Param: config_file = {config_file}") @@ -32,12 +34,11 @@ def run(self): self.create_source_db(source_db_url, reuse_db) # Extract sources to download from config file - sources = [] with open(config_file) as conf_file: sources = json.load(conf_file) - if len(sources) < 1: - raise IOError( + if not sources: + raise ValueError( f"No sources found in config file {config_file}. Need sources to run pipeline" ) diff --git a/src/python/ensembl/production/xrefs/config/gencode_sources.json b/src/python/ensembl/production/xrefs/config/gencode_sources.json new file mode 100644 index 000000000..d9b0e2fa5 --- /dev/null +++ b/src/python/ensembl/production/xrefs/config/gencode_sources.json @@ -0,0 +1,204 @@ +[ + { + "name" : "ArrayExpress", + "parser" : "ArrayExpressParser", + "file" : "Database", + "db" : "core", + "priority" : 1 + }, + { + "name" : "CCDS", + "parser" : "CCDSParser", + "file" : "Database", + "db" : "ccds", + "priority" : 1 + }, + { + "name" : "UniParc", + "parser" : "ChecksumParser", + "file" : "https://ftp.ebi.ac.uk/pub/contrib/uniparc/upidump.lis.gz", + "db" : "checksum", + "priority" : 1 + }, + { + "name" : "RNACentral", + "parser" : "ChecksumParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/md5/md5.tsv.gz", + "db" : "checksum", + "priority" : 1 + }, + { + "name" : "DBASS3", + "parser" : "DBASSParser", + "file" : "https://www.dbass.soton.ac.uk/Dbass3/DownloadCsv", + "priority" : 1 + }, + { + "name" : "DBASS5", + "parser" : "DBASSParser", + "file" : "https://www.dbass.soton.ac.uk/Dbass5/DownloadCsv", + "priority" : 1 + }, + { + "name" : "EntrezGene", + "parser" : "EntrezGeneParser", + "file" : "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz", + "priority" : 1 + }, + { + "name" : "HPA", + "parser" : "HPAParser", + "file" : "https://www.proteinatlas.org/download/xref.php", + "priority" : 1 + }, + { + "name" : "MGI", + "parser" : "MGIParser", + "file" : "https://www.informatics.jax.org/downloads/reports/MRK_ENSEMBL.rpt", + "priority" : 2 + }, + { + "name" : "MGI_desc", + "parser" : "MGIDescParser", + "file" : "https://www.informatics.jax.org/downloads/reports/MRK_List2.rpt", + "priority" : 1 + }, + { + "name" : "MIM2GENE", + "parser" : "Mim2GeneParser", + "file" : "https://ftp.ncbi.nlm.nih.gov/gene/DATA/mim2gene_medgen", + "priority" : 3 + }, + { + "name" : "MIM", + "parser" : "MIMParser", + "file" : "https://data.omim.org/downloads/ZpPlmgwjuTBK9T5vf2sFjA/omim.txt.gz", + "priority" : 2 + }, + { + "name" : "RFAM", + "parser" : "RFAMParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.seed.gz", + "db" : "core", + "priority" : 1 + }, + { + "name" : "Reactome", + "parser" : "ReactomeParser", + "file" : "https://www.reactome.org/download/current/Ensembl2Reactome_All_Levels.txt", + "release" : "https://www.reactome.org/ReactomeRESTfulAPI/RESTfulWS/version", + "priority" : 1 + }, + { + "name" : "Reactome", + "parser" : "ReactomeParser", + "file" : "https://www.reactome.org/download/current/UniProt2Reactome_All_Levels.txt", + "release" : "https://www.reactome.org/ReactomeRESTfulAPI/RESTfulWS/version", + "priority" : 2 + }, + { + "name" : "RefSeq_dna", + "parser" : 
"RefSeqParser", + "file" : "https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/*rna.gbff.gz", + "method" : "--bestn 5", + "query_cutoff" : 90, + "target_cutoff" : 90, + "release" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-notes/RefSeq-release*.txt", + "priority" : 2, + "release_number" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER", + "catalog" : "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.files.installed" + }, + { + "name" : "RefSeq_dna", + "parser" : "RefSeqParser", + "file" : "https://ftp.ncbi.nih.gov/refseq/M_musculus/mRNA_Prot/*rna.gbff.gz", + "method" : "--bestn 5", + "query_cutoff" : 90, + "target_cutoff" : 90, + "release" : "https://ftp.ncbi.nih.gov/refseq/release/release-notes/RefSeq-release*.txt", + "priority" : 2, + "release_number" : "https://ftp.ncbi.nih.gov/refseq/release/RELEASE_NUMBER", + "catalog" : "https://ftp.ncbi.nlm.nih.gov/refseq/M_musculus/mRNA_Prot/mouse.files.installed" + }, + { + "name" : "RefSeq_peptide", + "parser" : "RefSeqParser", + "file" : "https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/*protein.gpff.gz", + "method" : "--bestn 1", + "query_cutoff" : 100, + "target_cutoff" : 100, + "release" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-notes/RefSeq-release*.txt", + "priority" : 3, + "release_number" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER", + "catalog" : "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.files.installed" + }, + { + "name" : "RefSeq_peptide", + "parser" : "RefSeqParser", + "file" : "https://ftp.ncbi.nih.gov/refseq/M_musculus/mRNA_Prot/*protein.gpff.gz", + "method" : "--bestn 1", + "query_cutoff" : 100, + "target_cutoff" : 100, + "release" : "https://ftp.ncbi.nih.gov/refseq/release/release-notes/RefSeq-release*.txt", + "priority" : 3, + "release_number" : "https://ftp.ncbi.nih.gov/refseq/release/RELEASE_NUMBER", + "catalog" : "https://ftp.ncbi.nlm.nih.gov/refseq/M_musculus/mRNA_Prot/mouse.files.installed" + }, + { + "name" : "Refseq_import", + "parser" : "RefSeqCoordinateParser", + "file" : "Database", + "db" : "otherfeatures", + "priority" : 2 + }, + { + "name" : "UCSC_hg38", + "parser" : "UCSCParser", + "file" : "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/knownGene.txt.gz", + "release" : "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/README.txt", + "priority" : 1 + }, + { + "name" : "UCSC_mm10", + "parser" : "UCSCParser", + "file" : "https://hgdownload.cse.ucsc.edu/goldenPath/mm10/database/knownGene.txt.gz", + "release" : "https://hgdownload.cse.ucsc.edu/goldenPath/mm10/database/README.txt", + "priority" : 1 + }, + { + "name" : "Uniprot/SWISSPROT", + "parser" : "UniProtParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_sprot.dat.gz", + "method" : "--bestn 1", + "query_cutoff" : 100, + "target_cutoff" : 100, + "release" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/reldate.txt", + "priority" : 1 + }, + { + "name" : "Uniprot/SPTREMBL", + "parser" : "UniProtParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_trembl.dat.gz", + "method" : "--bestn 1", + "query_cutoff" : 100, + "target_cutoff" : 100, + "release" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/reldate.txt", + "priority" : 1 + }, + { + "name" : "miRBase", + "parser" : "miRBaseParser", + "file" : "https://mirbase.org/download/miRNA.dat", + "method" : "--bestn 1", + "query_cutoff" : 90, + "target_cutoff" : 90, + "priority" : 1 + }, + { + "name" : "HGNC", + "parser" : 
"HGNCParser", + "file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", + "db" : "ccds", + "priority" : 3 + } +] diff --git a/src/python/ensembl/production/xrefs/config/xref_all_sources.json b/src/python/ensembl/production/xrefs/config/xref_all_sources.json index 929450ebf..1aa80c599 100644 --- a/src/python/ensembl/production/xrefs/config/xref_all_sources.json +++ b/src/python/ensembl/production/xrefs/config/xref_all_sources.json @@ -9,7 +9,7 @@ { "name" : "UniParc", "parser" : "ChecksumParser", - "file" : "https://ftp.ebi.ac.uk/pub/contrib/uniparc/upidump.lis", + "file" : "https://ftp.ebi.ac.uk/pub/contrib/uniparc/upidump.lis.gz", "db" : "checksum", "priority" : 1 }, @@ -20,30 +20,12 @@ "db" : "checksum", "priority" : 1 }, - { - "name" : "DBASS3", - "parser" : "DBASSParser", - "file" : "https://www.dbass.soton.ac.uk/Dbass3/DownloadCsv", - "priority" : 1 - }, - { - "name" : "DBASS5", - "parser" : "DBASSParser", - "file" : "https://www.dbass.soton.ac.uk/Dbass5/DownloadCsv", - "priority" : 1 - }, { "name" : "EntrezGene", "parser" : "EntrezGeneParser", "file" : "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz", "priority" : 1 }, - { - "name" : "HPA", - "parser" : "HPAParser", - "file" : "https://www.proteinatlas.org/download/xref.php", - "priority" : 1 - }, { "name" : "MGI", "parser" : "MGIParser", @@ -52,28 +34,10 @@ }, { "name" : "MGI_desc", - "parser" : "MGI_Desc_Parser", + "parser" : "MGIDescParser", "file" : "https://www.informatics.jax.org/downloads/reports/MRK_List2.rpt", "priority" : 1 }, - { - "name" : "MGI_ccds", - "parser" : "MGI_CCDS_Parser", - "file" : "https://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_mouse/CCDS.current.txt", - "priority" : 2 - }, - { - "name" : "MIM2GENE", - "parser" : "Mim2GeneParser", - "file" : "https://ftp.ncbi.nlm.nih.gov/gene/DATA/mim2gene_medgen", - "priority" : 3 - }, - { - "name" : "MIM", - "parser" : "MIMParser", - "file" : "https://data.omim.org/downloads/ZpPlmgwjuTBK9T5vf2sFjA/omim.txt.gz", - "priority" : 2 - }, { "name" : "RFAM", "parser" : "RFAMParser", @@ -81,17 +45,10 @@ "db" : "core", "priority" : 1 }, - { - "name" : "RFAM", - "parser" : "CoreXrefParser", - "file" : "script:logic_name=>rfam_12.2_gene,object_type=>gene", - "db" : "core", - "priority" : 1 - } { "name" : "RGD", "parser" : "RGDParser", - "file" : "https://download.rgd.mcw.edu/pub/data_release/GENES.RAT.txt", + "file" : "https://download.rgd.mcw.edu/pub/data_release/GENES_RAT.txt", "priority" : 2 }, { @@ -110,7 +67,7 @@ }, { "name" : "RefSeq_dna", - "parser" : "RefSeqGPFFParser", + "parser" : "RefSeqParser", "file" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/complete.*rna.gbff.gz", "method" : "--bestn 5", "query_cutoff" : 90, @@ -122,7 +79,7 @@ }, { "name" : "RefSeq_peptide", - "parser" : "RefSeqGPFFParser", + "parser" : "RefSeqParser", "file" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/complete.*.protein.gpff.gz", "method" : "--bestn 1", "query_cutoff" : 100, @@ -139,13 +96,6 @@ "db" : "otherfeatures", "priority" : 2 }, - { - "name" : "UCSC_hg38", - "parser" : "UCSCParser", - "file" : "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/knownGene.txt.gz", - "release" : "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/README.txt", - "priority" : 1 - }, { "name" : 
"UCSC_mm10", "parser" : "UCSCParser", @@ -218,7 +168,7 @@ { "name" : "Xenbase", "parser" : "XenopusJamboreeParser", - "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping.txt", + "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt", "priority" : 1 }, { @@ -229,12 +179,5 @@ "query_cutoff" : 90, "target_cutoff" : 90, "priority" : 1 - }, - { - "name" : "HGNC", - "parser" : "HGNCParser", - "file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", - "db" : "ccds", - "priority" : 3 } -] \ No newline at end of file +] diff --git a/src/python/ensembl/production/xrefs/config/xref_config.ini b/src/python/ensembl/production/xrefs/config/xref_config.ini index ca3452245..6541d96ae 100644 --- a/src/python/ensembl/production/xrefs/config/xref_config.ini +++ b/src/python/ensembl/production/xrefs/config/xref_config.ini @@ -457,7 +457,7 @@ name = MGI order = 1 priority = 10 prio_descr = descriptions -parser = MGI_Desc_Parser +parser = MGIDescParser [source Reactome::MULTI] # Used by all species @@ -519,7 +519,7 @@ name = RefSeq_dna order = 15 priority = 2 prio_descr = refseq -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_dna::gencode] # Used by human and mouse @@ -527,7 +527,7 @@ name = RefSeq_dna order = 15 priority = 2 prio_descr = refseq -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_dna::MULTI-fungi] # Used by saccharomyces_cerevisiae @@ -542,7 +542,7 @@ name = RefSeq_dna order = 15 priority = 2 prio_descr = refseq -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_dna::MULTI-complete] # Used by phaeodactylum_tricornutum @@ -670,55 +670,55 @@ parser = RefSeqParser name = RefSeq_peptide order = 30 priority = 2 -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide::gencode] name = RefSeq_peptide order = 30 priority = 2 -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide::MULTI-fungi] # Used by saccharomyces_cerevisiae name = RefSeq_peptide order = 25 priority = 2 -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide::MULTI-Plants] name = RefSeq_peptide order = 25 priority = 2 -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide::MULTI-complete] # Used by phaeodactylum_tricornutum name = RefSeq_peptide order = 25 priority = 2 -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide::MULTI-protozoa] # Used by dictyostelium_discoideum name = RefSeq_peptide order = 25 priority = 2 -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide::MULTI-invertebrate] # Used by caenorhabditis_elegans, ciona_savignyi, drosophila_melanogaster name = RefSeq_peptide order = 25 priority = 2 -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide_predicted::MULTI] -# Special source used in RefSeqGPFFParser. No species uses this source. +# Special source used in RefSeqParser. No species uses this source. 
name = RefSeq_peptide_predicted order = 30 priority = 2 prio_descr = refseq -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide::MULTI-vertebrate] # Used by vertebrates @@ -726,7 +726,7 @@ name = RefSeq_peptide order = 25 priority = 2 prio_descr = refseq -parser = RefSeqGPFFParser +parser = RefSeqParser [source SGD_GENE::saccharomyces_cerevisiae] # Used by saccharomyces_cerevisiae From 571d04e17e5ee2044330c84ef08ca94d31d2f45a Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 2 Dec 2024 14:31:41 +0000 Subject: [PATCH 05/12] Download Pipeline fixes --- nextflow/config/xref.config | 30 +++++-- nextflow/workflows/xrefDownload.nf | 24 +++--- scripts/xrefs/cleanup_and_split_source.pl | 2 +- scripts/xrefs/run_process.pl | 84 ++++++++++++++++++ .../production/xrefs/EmailNotification.py | 85 ++++++++++++------- .../xrefs/config/xref_all_sources.json | 2 +- src/python/scripts/run_module.py | 2 +- 7 files changed, 174 insertions(+), 55 deletions(-) create mode 100644 scripts/xrefs/run_process.pl diff --git a/nextflow/config/xref.config b/nextflow/config/xref.config index a7cef685e..2518e806e 100644 --- a/nextflow/config/xref.config +++ b/nextflow/config/xref.config @@ -14,7 +14,6 @@ params.sources_config_file = "${params.work_dir}/ensembl-production/src/python/e params.source_db_url = '' params.skip_download = 0 params.reuse_db = 0 -params.skip_preparse = 1 params.split_files_by_species = 1 params.tax_ids_file = '' params.update_mode = 0 @@ -23,6 +22,16 @@ params.base_path = '' params.clean_files = 1 params.clean_dir = "${params.base_path}/clean_files" +params.species = '' +params.antispecies = '' +params.division = '' +params.run_all = 0 + +params.history_file = '' +params.dc_config_file = '' +params.old_server_uri = '' +params.registry_file = '' + trace { enabled = true file = "trace" @@ -38,19 +47,20 @@ report { profiles { slurm { process { - errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'finish' } executor = 'slurm' queue = 'production' queueSize = 300 - maxRetries = 2 time = '1d' memory = 100.MB + errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'finish' } + maxRetries = 2 + withLabel:small_process { memory = 200.MB } - withLabel: dm { + withLabel:dm { queue = 'datamover' memory = 2.GB } @@ -59,8 +69,11 @@ profiles { memory = 1.GB } - withLabel:mem4GB { + withLabel:cleanup_mem { memory = 4.GB + errorStrategy = 'retry' + maxRetries = 0 + time = '7d' } withLabel:align_mem { @@ -68,8 +81,11 @@ profiles { maxRetries = 5 memory = { task.attempt <= 5 ? 
4.GB * (task.attempt * task.attempt) : 16.GB } } + + withLabel:mapping_mem { + memory = 4.GB + maxRetries = 0 + } } } } - - diff --git a/nextflow/workflows/xrefDownload.nf b/nextflow/workflows/xrefDownload.nf index e87458735..8034627ed 100644 --- a/nextflow/workflows/xrefDownload.nf +++ b/nextflow/workflows/xrefDownload.nf @@ -4,6 +4,12 @@ params.pipeline_name = 'Xref Download Pipeline' params.help = false +// Ensure all paths are absolute +params.scripts_dir = file(params.scripts_dir).toAbsolutePath().toString() +params.perl_scripts_dir = file(params.perl_scripts_dir).toAbsolutePath().toString() +params.base_path = file(params.base_path).toAbsolutePath().toString() +params.clean_dir = file(params.clean_dir).toAbsolutePath().toString() + println """\ XREF DOWNLOAD PIPELINE ====================== @@ -11,11 +17,9 @@ println """\ base_path : ${params.base_path} reuse_db : ${params.reuse_db} skip_download : ${params.skip_download} - skip_preparse : ${params.skip_preparse} clean_files : ${params.clean_files} split_files_by_species : ${params.split_files_by_species} config_file : ${params.config_file} - sources_config_file : ${params.sources_config_file} clean_dir : ${params.clean_dir} tax_ids_file : ${params.tax_ids_file} update_mode : ${params.update_mode} @@ -38,9 +42,6 @@ def helpMessage() { --skip_download (optional) If set to 1, source files will only be downloaded if they don't already exist in --base_path. Default: 0 - --skip_preparse (optional) If set to 1, the pre-parse step will be skipped (no central DB). - Default: 1 - --clean_files (optional) If set to 1, the Cleanup analysis will be run for RefSeq and UniProt files. Default: 1 @@ -50,9 +51,6 @@ def helpMessage() { --config_file (optional) Path to the json file containing information about xref sources to download. Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_all_sources.json - --sources_config_file (optional) Path to the ini file containing information about all xref sources and species/divisions. - Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_config.ini - --clean_dir (optional) Path where to save the cleaned up files. 
Default: [--base_path]/clean_files @@ -111,7 +109,7 @@ process ScheduleDownload { timestamp = new java.util.Date().format("yyyyMMdd_HHmmss") """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleDownload --config_file ${params.config_file} --source_db_url ${params.source_db_url} --reuse_db ${params.reuse_db} --skip_preparse ${params.skip_preparse} --base_path ${params.base_path} --log_timestamp $timestamp + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleDownload --config_file ${params.config_file} --source_db_url ${params.source_db_url} --reuse_db ${params.reuse_db} --base_path ${params.base_path} --log_timestamp $timestamp """ } @@ -144,7 +142,7 @@ process CleanupTmpFiles { val 'TmpCleanupDone' """ - find ${params.base_path} -type f -name "*.tmp" -delete + find ${params.base_path} -path "${params.clean_dir}" -prune -o -type f -name "*.tmp" -exec rm -f {} + """ } @@ -180,7 +178,7 @@ process Checksum { } process CleanupSplitSource { - label 'mem4GB' + label 'cleanup_mem' tag "$src_name" input: @@ -207,7 +205,7 @@ process CleanupSplitSource { } process CleanupSource { - label 'mem4GB' + label 'cleanup_mem' tag "$src_name" input: @@ -240,4 +238,4 @@ process NotifyByEmail { """ python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp """ -} +} \ No newline at end of file diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl index f1ea08be0..0b956a31d 100644 --- a/scripts/xrefs/cleanup_and_split_source.pl +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -210,7 +210,7 @@ my $species_id_str = sprintf("%04d", $species_id); my @digits = split('', $species_id_str); - $write_path = catdir($output_path, @digits); + $write_path = catdir($output_path, $digits[0], $digits[1], $digits[2], $digits[3]); make_path($write_path); $write_file = catfile($write_path, "$output_file_name-$species_id"); diff --git a/scripts/xrefs/run_process.pl b/scripts/xrefs/run_process.pl new file mode 100644 index 000000000..0fd396aeb --- /dev/null +++ b/scripts/xrefs/run_process.pl @@ -0,0 +1,84 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Data::Dumper; +use Carp; +use Module::Load; +use JSON; + +# List of param names that should be treated as arrays +# TO DO: make this somehow more generic +my $array_params = { + 'analysis_types' => 1, 'datacheck_groups' => 1 +}; + +# Parse the command line parameters sent to the script +my $params = parse_options(); + +if (!defined($params->{'class'})) { + confess "--ERROR-- perl class not defined."; +} + +# Create the module object and initialize it +my $class = $params->{'class'}; +eval("use $class;"); + +my $runnable = $class->new($params); + +# Run the job life cycle +$runnable->fetch_input(); +$runnable->run(); +$runnable->write_output(); + +sub parse_options { + my $params; + my %hash; + + foreach my $option (@ARGV) { + next if ($option !~ /^-/); + + $option =~ s/^-//g; + my @tmp = split("=", $option, 2); + + if ($tmp[0] eq 'dataflow') { + my $decoded_dataflow = decode_json($tmp[1]); + while (my ($dt_key, $dt_val) = each %{$decoded_dataflow}) { + if ($dt_val && ($dt_val =~ /,/ || $array_params->{$dt_key})) { + my @values_array = split(",", $dt_val); + $params->{$dt_key} = \@values_array; + } else { + $params->{$dt_key} = $dt_val; + } + } + next; + } + + if 
($tmp[1] && ($tmp[1] =~ /,/ || $array_params->{$tmp[0]})) { + my @values_array = split(",", $tmp[1]); + $params->{$tmp[0]} = \@values_array; + } else { + $params->{$tmp[0]} = $tmp[1] + } + } + + return $params; +} + +__DATA__ +=pod +=head1 NAME +run_process.pl +=head1 SYNOPSIS + run_process.pl -class= [] +=head1 DESCRIPTION +run_process.pl is a generic script that is used to call runnables from a Nextflow .nf file. This script initializes the module object and runs the life cycle of that module: fetch_input(), run(), and write_output() +=head1 LICENSE + Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute + Copyright [2016-2022] EMBL-European Bioinformatics Institute + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License + is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and limitations under the License. +=cut diff --git a/src/python/ensembl/production/xrefs/EmailNotification.py b/src/python/ensembl/production/xrefs/EmailNotification.py index f574f2dc3..932b0c1b7 100644 --- a/src/python/ensembl/production/xrefs/EmailNotification.py +++ b/src/python/ensembl/production/xrefs/EmailNotification.py @@ -32,7 +32,10 @@ def run(self): email_server: str = self.get_param("email_server", {"required": True, "type": str}) log_timestamp: str = self.get_param("log_timestamp", {"type": str}) - email_message = f"The {pipeline_name} has completed its run.
" + email_message = f"

The {pipeline_name} has completed its run.
" + if re.search("Download", pipeline_name): + email_message += "To run the Xref Process Pipeline based on the data from this pipeline, use the same --source_db_url, --split_files_by_species, and --config_file values provided to this pipeline." + email_message += "

" if log_timestamp: # Get the path of the log files @@ -67,6 +70,7 @@ def combine_logs(self, base_path: str, timestamp: str, type: str) -> str: "ScheduleDownload", "DownloadSource", "ScheduleCleanup", + "Checksum", "Cleanup(.*)Source", "EmailNotification", ], @@ -117,7 +121,7 @@ def extract_parameters(self, data: str) -> Dict[str, str]: return {param[0]: param[1] for param in parameters_list} def format_parameters(self, parameters: Dict[str, str]) -> str: - message = "
 The pipeline was run with the following parameters:<br>" + message = "<br> Run Parameters<br>" for param_name, param_value in parameters.items(): message += f"{param_name} = {param_value}
" @@ -133,12 +137,31 @@ def extract_download_statistics(self, data: str) -> Tuple[Dict[str, Dict[str, An def extract_sources_data(self, data: str) -> Dict[str, Dict[str, Any]]: sources_data = {} - sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to download: ([\w\/]+)", "to_download")) - sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to cleanup: ([\w\/]+)", "to_cleanup")) - sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source ([\w\/]+) cleaned up", "cleaned_up")) - sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file already exists, skipping download \((.*)\)", "skipped", True)) - sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file downloaded via (HTTP|FTP): (.*)", "downloaded", True)) - sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file copied from local FTP: (.*)", "copied", True)) + # Helper function to update sources_data + def update_sources_data(new_data: Dict[str, Dict[str, Any]]): + for key, value in new_data.items(): + if key in sources_data: + sources_data[key].update(value) + else: + sources_data[key] = value + + # Get sources set to be downloaded + update_sources_data(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to download: ([\w\/]+)", "to_download")) + + # Get sources set to be cleaned up + update_sources_data(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to cleanup: ([\w\/]+)", "to_cleanup")) + + # Get sources cleaned up + update_sources_data(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source ([\w\/]+) cleaned up", "cleaned_up")) + + # Get sources skipped + update_sources_data(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file already exists, skipping download \((.*)\)", "skipped", True)) + + # Get sources downloaded + update_sources_data(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file downloaded via (HTTP|FTP): (.*)", "downloaded", True)) + + # Get sources copied + update_sources_data(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file copied from local FTP: (.*)", "copied", True)) return sources_data @@ -151,7 +174,7 @@ def extract_sources(self, data: str, pattern: str, key: str, split: bool = False if key == "skipped" or key == "copied": val = os.path.dirname(match[1]) else: - val = f"{match[1]}|" + os.path.dirname(match[2]) + val = os.path.dirname(match[2]) sources[match[0]] = {key: val} else: sources[match] = {key: True} @@ -167,33 +190,31 @@ def extract_added_species(self, data: str) -> Dict[str, str]: return {species[0]: species[1] for species in added_species_list} def format_download_statistics(self, sources_data: Dict[str, Dict[str, Any]], added_species: Dict[str, str], skipped_species: Dict[str, str]) -> str: - message = "
--Source Statistics--
" + cell_style = 'style="border-right: 1px solid #000; padding: 5px;"' + message = "
Source Statistics
" + message += f"" + message += f"" for source_name, source_values in sources_data.items(): - message += f"{source_name}:
" - if source_values.get("to_download"): - message += f"{self.INDENT}Scheduled for download ✔
" - if source_values.get("downloaded"): - download_type, file_path = source_values["downloaded"].split("|") - message += f"{self.INDENT}File downloaded via {download_type} into {file_path}
" - elif source_values.get("copied"): - message += f"{self.INDENT}File(s) copied from local FTP into {source_values['copied']}
" - elif source_values.get("skipped"): - message += f"{self.INDENT}File(s) download skipped, already exists in {source_values['skipped']}
" - if source_values.get("to_cleanup"): - message += f"{self.INDENT}Scheduled for cleanup ✔
" - if source_values.get("cleaned_up"): - message += f"{self.INDENT}Cleaned up ✔
" - - message += "
--Species Statistics--
" - message += "Skipped Species (files already exist):
" - for source_name, count in skipped_species.items(): - message += f"{self.INDENT}{source_name}: {count}
" - message += "Added Species (files created):
" + message += f"" + message += f"" if source_values.get("to_download") else f"" + message += f"" if source_values.get("downloaded") or source_values.get("copied") else f"" + message += f"" if source_values.get("skipped") else f"" + message += f"" if source_values.get("to_cleanup") else f"" + message += f"" + message += "" + message += "
Source | Scheduled | Downloaded | Download Skipped | Cleaned-up | Location
{source_name} | X | X | X | X | {source_values.get('downloaded', source_values.get('copied', source_values.get('skipped', '')))}
" + + message += "
Species Statistics
" + message += "Added Species (files created):
    " for source_name, count in added_species.items(): - message += f"{self.INDENT}{source_name}: {count}
    " + message += f"
  • {source_name}: {count}
  • " + message += "
" + message += "Skipped Species (files already exist):
    " + for source_name, count in skipped_species.items(): + message += f"
  • {source_name}: {count}
  • " + message += "
" - message += "
To run the Xref Process Pipeline based on the data from this pipeline, use the same --source_db_url, --split_files_by_species, and --config_file values provided to this pipeline." return message def extract_process_statistics(self, data: str) -> Tuple[Dict[str, Dict[str, str]], Dict[str, Dict[str, int]]]: diff --git a/src/python/ensembl/production/xrefs/config/xref_all_sources.json b/src/python/ensembl/production/xrefs/config/xref_all_sources.json index 1aa80c599..006d8ce71 100644 --- a/src/python/ensembl/production/xrefs/config/xref_all_sources.json +++ b/src/python/ensembl/production/xrefs/config/xref_all_sources.json @@ -168,7 +168,7 @@ { "name" : "Xenbase", "parser" : "XenopusJamboreeParser", - "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt", + "file" : "https://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt", "priority" : 1 }, { diff --git a/src/python/scripts/run_module.py b/src/python/scripts/run_module.py index 874f02dd8..3b7f32cef 100644 --- a/src/python/scripts/run_module.py +++ b/src/python/scripts/run_module.py @@ -21,7 +21,7 @@ def main(): params = Params() - module_name = params.param_required('module') + module_name = params.get_param('module', {"required": True}) class_name = module_name.split(".")[-1] module = importlib.import_module(module_name) From ccf167de47ff9373e8469760379c22e6be015ab5 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Tue, 3 Dec 2024 09:45:34 +0000 Subject: [PATCH 06/12] New xref processing pipleine --- nextflow/workflows/xrefProcess.nf | 7 +- scripts/xrefs/coordinate_mapper.pl | 101 +-- scripts/xrefs/dump_ensembl.pl | 26 +- scripts/xrefs/refseq_coordinate_parser.pl | 87 +- .../production/xrefs/AdvisoryXrefReport.py | 13 +- .../ensembl/production/xrefs/Alignment.py | 71 +- .../production/xrefs/CoordinateMapping.py | 29 +- .../ensembl/production/xrefs/DirectXrefs.py | 22 +- .../ensembl/production/xrefs/DumpEnsembl.py | 69 +- .../ensembl/production/xrefs/DumpXref.py | 97 +- .../xrefs/EmailAdvisoryXrefReport.py | 66 +- .../ensembl/production/xrefs/Mapping.py | 23 +- .../ensembl/production/xrefs/ParseSource.py | 46 +- .../production/xrefs/ProcessAlignment.py | 21 +- .../production/xrefs/RNACentralMapping.py | 29 +- .../production/xrefs/ScheduleAlignment.py | 37 +- .../production/xrefs/ScheduleMapping.py | 21 +- .../ensembl/production/xrefs/ScheduleParse.py | 124 +-- .../production/xrefs/ScheduleSpecies.py | 177 ++-- .../production/xrefs/UniParcMapping.py | 29 +- .../production/xrefs/mappers/BasicMapper.py | 209 ++--- .../xrefs/mappers/ChecksumMapper.py | 22 +- .../xrefs/mappers/CoordinateMapper.py | 162 ++-- .../production/xrefs/mappers/CoreInfo.py | 370 ++++---- .../xrefs/mappers/DirectXrefsMapper.py | 36 +- .../production/xrefs/mappers/DisplayXrefs.py | 362 +++----- .../xrefs/mappers/OfficialNaming.py | 227 +++-- .../xrefs/mappers/ProcessMappings.py | 486 +++++----- .../production/xrefs/mappers/ProcessMoves.py | 497 +++++------ .../production/xrefs/mappers/ProcessPaired.py | 47 +- .../xrefs/mappers/ProcessPriorities.py | 46 +- .../xrefs/mappers/RNACentralMapper.py | 3 +- .../production/xrefs/mappers/TestMappings.py | 108 ++- .../production/xrefs/mappers/UniParcMapper.py | 3 +- .../production/xrefs/mappers/XrefLoader.py | 843 +++++++++--------- .../xrefs/mappers/methods/ChecksumBasic.py | 33 +- .../xrefs/mappers/methods/MySQLChecksum.py | 23 +- .../xrefs/mappers/species/aedes_aegypti.py | 4 +- .../mappers/species/anopheles_gambiae.py | 4 +- 
.../mappers/species/culex_quinquefasciatus.py | 4 +- .../xrefs/mappers/species/danio_rerio.py | 3 +- .../xrefs/mappers/species/drosophila.py | 4 +- .../xrefs/mappers/species/eukaryota.py | 25 +- .../xrefs/mappers/species/homo_sapiens.py | 4 +- .../mappers/species/ixodes_scapularis.py | 4 +- .../xrefs/mappers/species/mus_musculus.py | 4 +- .../mappers/species/neurospora_crassa.py | 4 +- .../xrefs/mappers/species/parasite.py | 3 +- .../mappers/species/rattus_norvegicus.py | 4 +- .../species/saccharomyces_cerevisiae.py | 4 +- .../xrefs/mappers/species/sars_cov_2.py | 14 +- .../species/schizosaccharomyces_pombe.py | 4 +- .../xrefs/mappers/species/sus_scrofa.py | 4 +- .../xrefs/mappers/species/wormbase.py | 14 +- .../xrefs/parsers/ArrayExpressParser.py | 14 +- .../production/xrefs/parsers/BaseParser.py | 51 +- .../production/xrefs/parsers/CCDSParser.py | 6 +- .../xrefs/parsers/EntrezGeneParser.py | 8 +- .../production/xrefs/parsers/HGNCParser.py | 10 +- .../xrefs/parsers/JGI_ProteinParser.py | 2 +- .../xrefs/parsers/Mim2GeneParser.py | 6 +- .../production/xrefs/parsers/RFAMParser.py | 12 +- .../production/xrefs/parsers/RGDParser.py | 2 +- .../xrefs/parsers/ReactomeParser.py | 2 +- .../xrefs/parsers/RefSeqCoordinateParser.py | 21 +- .../production/xrefs/parsers/RefSeqParser.py | 10 +- .../production/xrefs/parsers/UniProtParser.py | 4 +- .../production/xrefs/parsers/ZFINParser.py | 6 +- .../production/xrefs/parsers/miRBaseParser.py | 2 +- 69 files changed, 2356 insertions(+), 2479 deletions(-) diff --git a/nextflow/workflows/xrefProcess.nf b/nextflow/workflows/xrefProcess.nf index 02517aa60..8ae1d8c19 100644 --- a/nextflow/workflows/xrefProcess.nf +++ b/nextflow/workflows/xrefProcess.nf @@ -4,6 +4,11 @@ params.pipeline_name = 'Xref Process Pipeline' params.help = false +// Ensure all paths are absolute +params.scripts_dir = file(params.scripts_dir).toAbsolutePath().toString() +params.perl_scripts_dir = file(params.perl_scripts_dir).toAbsolutePath().toString() +params.base_path = file(params.base_path).toAbsolutePath().toString() + println """\ XREF PROCESS PIPELINE ====================== @@ -622,6 +627,6 @@ process NotifyByEmail { val timestamp """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --release ${params.release} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp """ } \ No newline at end of file diff --git a/scripts/xrefs/coordinate_mapper.pl b/scripts/xrefs/coordinate_mapper.pl index 76c06775f..43fd99af8 100644 --- a/scripts/xrefs/coordinate_mapper.pl +++ b/scripts/xrefs/coordinate_mapper.pl @@ -20,9 +20,10 @@ use DBI; use JSON; use Getopt::Long; +use File::Spec::Functions qw(catfile); use Nextflow::Utils; -use Bio::EnsEMBL::DBSQL::DBAdaptor +use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::Mapper::RangeRegistry; my ($xref_db_url, $core_db_url, $species_id, $output_dir, $analysis_id); @@ -35,8 +36,8 @@ ); # Check that all parameters are passed -if (!defined($xref_db_url) || !defined($core_db_url) || !defined($species_id) || !defined($output_dir) || !defined($analysis_id)) { - croak "Usage: dump_ensembl.pl --xref_db_url --core_db_url --species_id --output_dir 
--analysis_id "; +foreach my $param ($xref_db_url, $core_db_url, $species_id, $output_dir, $analysis_id) { + defined $param or croak "Usage: dump_ensembl.pl --xref_db_url --core_db_url --species_id --output_dir --analysis_id "; } # Set the files to use @@ -45,27 +46,30 @@ my $unmapped_reason_filename = catfile($output_dir, 'unmapped_reason_coord.txt'); my $unmapped_object_filename = catfile($output_dir, 'unmapped_object_coord.txt'); -# Connect tp dbs -my ($core_user, $core_pass, $core_host, $core_port, $core_dbname) = parse_url($core_db_url); +# Connect to dbs +my ($core_host, $core_port, $core_user, $core_pass, $core_dbname) = parse_url($core_db_url); my $core_dbi = get_dbi($core_host, $core_port, $core_user, $core_pass, $core_dbname); my $xref_dbi = get_dbi(parse_url($xref_db_url)); # Figure out the last used IDs in the core DB -my $xref_id = $core_dbi->selectall_arrayref('SELECT MAX(xref_id) FROM xref')->[0][0]; -my $object_xref_id = $core_dbi->selectall_arrayref('SELECT MAX(object_xref_id) FROM object_xref')->[0][0]; -my $unmapped_object_id = $core_dbi->selectall_arrayref('SELECT MAX(unmapped_object_id) FROM unmapped_object')->[0][0]; -my $unmapped_reason_id = $core_dbi->selectall_arrayref('SELECT MAX(unmapped_reason_id) FROM unmapped_reason')->[0][0]; +my $xref_id = $core_dbi->selectrow_array('SELECT MAX(xref_id) FROM xref') || 0; +my $object_xref_id = $core_dbi->selectrow_array('SELECT MAX(object_xref_id) FROM object_xref') || 0; +my $unmapped_object_id = $core_dbi->selectrow_array('SELECT MAX(unmapped_object_id) FROM unmapped_object') || 0; +my $unmapped_reason_id = $core_dbi->selectrow_array('SELECT MAX(unmapped_reason_id) FROM unmapped_reason') || 0; my (%unmapped, %mapped); my $external_db_id; # Read and store available Xrefs from the Xref database -my $xref_sth = $xref_dbi->prepare("SELECT c.coord_xref_id,s.name,c.accession FROM coordinate_xref c,source s WHERE c.source_id=s.source_id AND c.species_id=?"); +my $xref_sth = $xref_dbi->prepare("SELECT c.coord_xref_id, s.name, c.accession FROM coordinate_xref c JOIN source s ON c.source_id = s.source_id WHERE c.species_id = ?"); $xref_sth->bind_param(1, $species_id, SQL_INTEGER); $xref_sth->execute(); while (my $xref = $xref_sth->fetchrow_hashref()) { - $external_db_id ||= $core_dbi->selectall_arrayref('SELECT external_db_id FROM external_db WHERE db_name='.$xref->{'name'})->[0][0]; + my $sth_external_db = $core_dbi->prepare('SELECT external_db_id FROM external_db WHERE db_name = ?'); + $sth_external_db->execute($xref->{'name'}); + $external_db_id ||= ($sth_external_db->fetchrow_array())[0]; + $sth_external_db->finish(); $external_db_id ||= 11000; # FIXME (11000 is 'UCSC') $unmapped{$xref->{'coord_xref_id'}} = { @@ -77,16 +81,14 @@ } $xref_sth->finish(); -if (!defined($external_db_id)) { - die "External_db_id is undefined for species_id = $species_id\n"; -} +defined $external_db_id or die "External_db_id is undefined for species_id = $species_id\n"; # Start the coordinate matching my $core_db_adaptor = Bio::EnsEMBL::DBSQL::DBAdaptor->new( - -host => $core_host, - -port => $core_port, - -user => $core_user, - -pass => $core_pass, + -host => $core_host, + -port => $core_port, + -user => $core_user, + -pass => $core_pass, -dbname => $core_dbname, ); @@ -111,7 +113,7 @@ my $chr_name = $chromosome->seq_region_name(); my @genes = @{ $chromosome->get_all_Genes( undef, undef, 1 ) }; - while (my $gene = shift(@genes)) { + foreach my $gene (@genes) { my @transcripts = @{ $gene->get_all_Transcripts() }; my %gene_result; @@ -144,12 +146,7 @@ # 
'$rr1' is the RangeRegistry holding Ensembl exons for one transcript at a time. my $rr1 = Bio::EnsEMBL::Mapper::RangeRegistry->new(); - my $coding_transcript; - if (defined($transcript->translation())) { - $coding_transcript = 1; - } else { - $coding_transcript = 0; - } + my $coding_transcript = defined($transcript->translation()) ? 1 : 0; foreach my $exon (@exons) { # Register each exon in the RangeRegistry. Register both the @@ -198,7 +195,7 @@ for (my $i = 0 ; $i < $exonCount ; ++$i) { # Register the exons from the external database in the same - # was as with the Ensembl exons, and calculate the overlap + # way as with the Ensembl exons, and calculate the overlap # of the external exons with the previously registered # Ensembl exons. @@ -206,9 +203,7 @@ $exon_match += $overlap/($exonEnds[$i] - $exonStarts[$i] + 1); $rr2->check_and_register('exon', $exonStarts[$i], $exonEnds[$i]); - if (!defined($cdsStart) || !defined($cdsEnd)) { - # Non-coding transcript. - } else { + if (defined($cdsStart) && defined($cdsEnd)) { my $codingStart = ($exonStarts[$i] > $cdsStart ? $exonStarts[$i] : $cdsStart); my $codingEnd = ($exonEnds[$i] < $cdsEnd ? $exonEnds[$i] : $cdsEnd); @@ -255,7 +250,7 @@ $coding_weight*($coding_count + $ens_weight*$rcoding_count) ); - if (!defined( $transcript_result{$coord_xref_id}) || $transcript_result{$coord_xref_id} < $score) { + if (!defined($transcript_result{$coord_xref_id}) || $transcript_result{$coord_xref_id} < $score) { $transcript_result{$coord_xref_id} = $score; } @@ -266,16 +261,16 @@ # this transcript. my $best_score; - foreach my $coord_xref_id (sort( { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result) )) { + foreach my $coord_xref_id (sort { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result)) { my $score = $transcript_result{$coord_xref_id}; if ($score > $transcript_score_threshold) { $best_score ||= $score; if (sprintf("%.3f", $score) eq sprintf("%.3f", $best_score)) { - if (exists( $unmapped{$coord_xref_id})) { + if (exists($unmapped{$coord_xref_id})) { $mapped{$coord_xref_id} = $unmapped{$coord_xref_id}; - delete( $unmapped{$coord_xref_id} ); + delete($unmapped{$coord_xref_id}); $mapped{$coord_xref_id}{'reason'} = undef; $mapped{$coord_xref_id}{'reason_full'} = undef; $mapped{$coord_xref_id}{'chr_name'} = $chr_name; @@ -287,21 +282,21 @@ }); # This is now a candidate Xref for the gene. 
- if (!defined( $gene_result{$coord_xref_id}) || $gene_result{$coord_xref_id} < $score) { + if (!defined($gene_result{$coord_xref_id}) || $gene_result{$coord_xref_id} < $score) { $gene_result{$coord_xref_id} = $score; } } elsif (exists($unmapped{$coord_xref_id})) { $unmapped{$coord_xref_id}{'reason'} = 'Was not best match'; $unmapped{$coord_xref_id}{'reason_full'} = sprintf("Did not top best transcript match score (%.2f)", $best_score); - if (!defined( $unmapped{$coord_xref_id}{'score'}) || $score > $unmapped{$coord_xref_id}{'score'}) { + if (!defined($unmapped{$coord_xref_id}{'score'}) || $score > $unmapped{$coord_xref_id}{'score'}) { $unmapped{$coord_xref_id}{'score'} = $score; $unmapped{$coord_xref_id}{'ensembl_id'} = $transcript->dbID(); } } - } elsif (exists( $unmapped{$coord_xref_id}) && $unmapped{$coord_xref_id}{'reason'} ne 'Was not best match') { + } elsif (exists($unmapped{$coord_xref_id}) && $unmapped{$coord_xref_id}{'reason'} ne 'Was not best match') { $unmapped{$coord_xref_id}{'reason'} = 'Did not meet threshold'; - $unmapped{$coord_xref_id}{'reason_full'} = sprintf( "Match score for transcript lower than threshold (%.2f)", $transcript_score_threshold); - if (!defined( $unmapped{$coord_xref_id}{'score'}) || $score > $unmapped{$coord_xref_id}{'score'}) { + $unmapped{$coord_xref_id}{'reason_full'} = sprintf("Match score for transcript lower than threshold (%.2f)", $transcript_score_threshold); + if (!defined($unmapped{$coord_xref_id}{'score'}) || $score > $unmapped{$coord_xref_id}{'score'}) { $unmapped{$coord_xref_id}{'score'} = $score; $unmapped{$coord_xref_id}{'ensembl_id'} = $transcript->dbID(); } @@ -325,35 +320,21 @@ sub parse_url { my ($url) = @_; - my $parsed_url = Nextflow::Utils::parse($url); - my $user = $parsed_url->{'user'}; - my $pass = $parsed_url->{'pass'}; - my $host = $parsed_url->{'host'}; - my $port = $parsed_url->{'port'}; - my $db = $parsed_url->{'dbname'}; - - return ($host, $port, $user, $pass, $db) + return @{$parsed_url}{qw(host port user pass dbname)}; } sub get_dbi { my ($host, $port, $user, $pass, $dbname) = @_; - - my $dbconn; - if (defined $dbname) { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname); - } else { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s", $host, $port); - } - my $dbi = DBI->connect( $dbconn, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr ); - + my $dbconn = defined $dbname ? sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname) : sprintf("dbi:mysql:host=%s;port=%s", $host, $port); + my $dbi = DBI->connect($dbconn, $user, $pass, { 'RaiseError' => 1 }) or croak("Can't connect to database: " . $DBI::errstr); return $dbi; } sub dump_xref { my ($filename, $xref_id, $mapped, $unmapped) = @_; - my $fh = IO::File->new('>' . $filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + my $fh = IO::File->new($filename, 'w') or croak(sprintf("Can not open '%s' for writing", $filename)); foreach my $xref (values(%{$unmapped}), values(%{$mapped})) { # Assign 'xref_id' to this Xref. @@ -382,7 +363,7 @@ sub dump_xref { sub dump_object_xref { my ($filename, $object_xref_id, $analysis_id, $mapped) = @_; - my $fh = IO::File->new('>' . 
$filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + my $fh = IO::File->new($filename, 'w') or croak(sprintf("Can not open '%s' for writing", $filename)); foreach my $xref (values(%{$mapped})) { foreach my $object_xref (@{ $xref->{'mapped_to'} }) { @@ -417,7 +398,7 @@ sub dump_unmapped_reason { } } - my $fh = IO::File->new('>' . $filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + my $fh = IO::File->new($filename, 'w') or croak(sprintf("Can not open '%s' for writing", $filename)); my $sth = $core_dbi->prepare('SELECT unmapped_reason_id FROM unmapped_reason WHERE full_description = ?'); @@ -457,7 +438,7 @@ sub dump_unmapped_reason { sub dump_unmapped_object { my ($filename, $unmapped_object_id, $analysis_id, $unmapped) = @_; - my $fh = IO::File->new('>' . $filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + my $fh = IO::File->new($filename, 'w') or croak(sprintf("Can not open '%s' for writing", $filename)); foreach my $xref (values(%{$unmapped})) { # Assign 'unmapped_object_id' to this Xref. @@ -523,7 +504,7 @@ sub upload_data { my $load_sql = sprintf("LOAD DATA LOCAL INFILE ? REPLACE INTO TABLE %s", $table_name); - my $rows = $dbi->do($cleanup_sql, undef, $external_db_id) or croak($dbi->strerr()); + my $rows = $dbi->do($cleanup_sql, undef, $external_db_id) or croak($dbi->errstr()); $rows = $dbi->do($load_sql, undef, $filename) or croak($dbi->errstr()); diff --git a/scripts/xrefs/dump_ensembl.pl b/scripts/xrefs/dump_ensembl.pl index 22132195d..1e035090d 100644 --- a/scripts/xrefs/dump_ensembl.pl +++ b/scripts/xrefs/dump_ensembl.pl @@ -34,21 +34,21 @@ ); # Check that all parameters are passed -if (!defined($cdna_path) || !defined($pep_path) || !defined($species) || !defined($core_db_url) || !defined($release)) { - croak "Usage: dump_ensembl.pl --cdna_path --pep_path --species --core_db_url --release "; +foreach my $param ($cdna_path, $pep_path, $species, $core_db_url, $release) { + defined $param or croak "Usage: dump_ensembl.pl --cdna_path --pep_path --species --core_db_url --release "; } # Open fasta files for writing -my $cdna_fh = IO::File->new($cdna_path ,'w') || throw("Cannot create filehandle $cdna_path"); +my $cdna_fh = IO::File->new($cdna_path, 'w') or croak "Cannot create filehandle $cdna_path"; my $cdna_writer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($cdna_fh); -my $pep_fh = IO::File->new($pep_path ,'w') || throw("Cannot create filehandle $pep_path"); +my $pep_fh = IO::File->new($pep_path, 'w') or croak "Cannot create filehandle $pep_path"; my $pep_writer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($pep_fh); # Load the registry -my ($user, $pass, $host, $port, $dbname) = parse_url($core_db_url); +my ($host, $port, $user, $pass, $dbname) = parse_url($core_db_url); my $registry = 'Bio::EnsEMBL::Registry'; my %registry_params = (-HOST => $host, -PORT => $port, -USER => $user, -DB_VERSION => $release); -$registry_params{-PASS} = $pass if ($pass); +$registry_params{-PASS} = $pass if $pass; $registry->load_registry_from_db(%registry_params); # Get transcripts @@ -56,14 +56,13 @@ my $transcript_list = $transcript_adaptor->fetch_all(); # Dump sequence data -while (my $transcript = shift @$transcript_list) { +foreach my $transcript (@$transcript_list) { my $sequence = $transcript->seq(); $sequence->id($transcript->dbID()); $cdna_writer->print_Seq($sequence); # Get and dump translation data - my $translation = $transcript->translation; - if ($translation) { + if (my $translation = $transcript->translation) { 
$sequence = $transcript->translate; $sequence->id($translation->dbID()); $pep_writer->print_Seq($sequence); @@ -77,10 +76,5 @@ sub parse_url { my ($url) = @_; my $parsed_url = Nextflow::Utils::parse($url); - my $user = $parsed_url->{'user'}; - my $pass = $parsed_url->{'pass'}; - my $host = $parsed_url->{'host'}; - my $port = $parsed_url->{'port'}; - my $db = $parsed_url->{'dbname'}; - return ($user, $pass, $host, $port, $db); -} \ No newline at end of file + return @{$parsed_url}{qw(host port user pass dbname)}; +} diff --git a/scripts/xrefs/refseq_coordinate_parser.pl b/scripts/xrefs/refseq_coordinate_parser.pl index dae228391..39c9e4059 100644 --- a/scripts/xrefs/refseq_coordinate_parser.pl +++ b/scripts/xrefs/refseq_coordinate_parser.pl @@ -37,8 +37,8 @@ ); # Check that all parameters are passed -if (!defined($xref_db_url) || !defined($core_db_url) || !defined($otherf_db_url) || !defined($source_ids_json) || !defined($species_id) || !defined($species_name) || !defined($release)) { - croak "Usage: dump_ensembl.pl --xref_db_url --core_db_url --otherf_db_url --source_ids --species_id --species_name --release "; +foreach my $param ($xref_db_url, $core_db_url, $otherf_db_url, $source_ids_json, $species_id, $species_name, $release) { + defined $param or croak "Usage: refseq_coordinate_parser.pl --xref_db_url --core_db_url --otherf_db_url --source_ids --species_id --species_name --release "; } my $transcript_score_threshold = 0.75; @@ -48,13 +48,12 @@ my $source_ids = decode_json($source_ids_json); # Connect to the xref db -my ($user, $pass, $host, $port, $xref_db) = parse_url($xref_db_url); -my $dbi = get_dbi($host, $port, $user, $pass, $xref_db); +my $dbi = get_dbi(parse_url($xref_db_url)); # Load the registry my $registry = 'Bio::EnsEMBL::Registry'; -my ($core_user, $core_pass, $core_host, $core_port, $core_dbname) = parse_url($core_db_url); -my ($otherf_user, $otherf_pass, $otherf_host, $otherf_port, $otherf_dbname) = parse_url($otherf_db_url); +my ($core_host, $core_port, $core_user, $core_pass, $core_dbname) = parse_url($core_db_url); +my ($otherf_host, $otherf_port, $otherf_user, $otherf_pass, $otherf_dbname) = parse_url($otherf_db_url); $registry->load_registry_from_multiple_dbs( { -host => $core_host, @@ -73,11 +72,11 @@ ); # Get the EntrezGene and WikiGene accessions -my (%entrez_ids) = %{ get_valid_codes("EntrezGene", $species_id, $dbi) }; -my (%wiki_ids) = %{ get_valid_codes('WikiGene', $species_id, $dbi) }; +my %entrez_ids = %{ get_valid_codes("EntrezGene", $species_id, $dbi) }; +my %wiki_ids = %{ get_valid_codes('WikiGene', $species_id, $dbi) }; # Prepare link sql -my $add_dependent_xref_sth = $dbi->prepare("INSERT INTO dependent_xref (master_xref_id, dependent_xref_id, linkage_source_id) VALUES (?,?,?)"); +my $add_dependent_xref_sth = $dbi->prepare("INSERT IGNORE INTO dependent_xref (master_xref_id, dependent_xref_id, linkage_source_id) VALUES (?,?,?)"); # Get the db adaptors my $otherf_dba = $registry->get_DBAdaptor($species_name, 'otherfeatures'); @@ -98,8 +97,8 @@ # Not all species have refseq_import data, skip if not found if (!defined $logic_name) { - print STDERR "No data found for RefSeq_import, skipping import\n";; - exit 1; + print STDERR "No data found for RefSeq_import, skipping import\n"; + exit 0; } # Get otherfeatures chromosomes @@ -170,7 +169,7 @@ $start = $core_exon->seq_region_start(); $end = $core_exon->seq_region_end(); $overlap = $rr1->overlap_size('exon', $start, $end); - $core_exon_match += $overlap/($end - $start + 1); + $core_exon_match += $overlap / 
($end - $start + 1); $rr2->check_and_register('exon', $start, $end); } @@ -178,30 +177,30 @@ $start = $core_tl_exon->seq_region_start(); $end = $core_tl_exon->seq_region_end(); $overlap = $rr3->overlap_size('exon', $start, $end); - $core_tl_exon_match += $overlap/($end - $start + 1); + $core_tl_exon_match += $overlap / ($end - $start + 1); $rr4->check_and_register('exon', $start, $end); } - # Look for oeverlap between the two sets of exons + # Look for overlap between the two sets of exons foreach my $otherf_exon (@$otherf_exons) { $start = $otherf_exon->seq_region_start(); $end = $otherf_exon->seq_region_end(); $overlap = $rr2->overlap_size('exon', $start, $end); - $otherf_exon_match += $overlap/($end - $start + 1); + $otherf_exon_match += $overlap / ($end - $start + 1); } foreach my $otherf_tl_exon (@$otherf_tl_exons) { $start = $otherf_tl_exon->seq_region_start(); $end = $otherf_tl_exon->seq_region_end(); $overlap = $rr4->overlap_size('exon', $start, $end); - $otherf_tl_exon_match += $overlap/($end - $start + 1); + $otherf_tl_exon_match += $overlap / ($end - $start + 1); } # Compare exon matching with number of exons to give a score - my $score = ( ($otherf_exon_match + $core_exon_match)) / (scalar(@$otherf_exons) + scalar(@$core_exons) ); + my $score = ($otherf_exon_match + $core_exon_match) / (scalar(@$otherf_exons) + scalar(@$core_exons)); my $tl_score = 0; if (scalar(@$otherf_tl_exons) > 0) { - $tl_score = ( ($otherf_tl_exon_match + $core_tl_exon_match)) / (scalar(@$otherf_tl_exons) + scalar(@$core_tl_exons) ); + $tl_score = ($otherf_tl_exon_match + $core_tl_exon_match) / (scalar(@$otherf_tl_exons) + scalar(@$core_tl_exons)); } if ($core_transcript->biotype eq $otherf_transcript->biotype) { $transcript_result{$core_transcript->stable_id} = $score; @@ -216,7 +215,7 @@ my ($best_id, $score, $tl_score); # Compare the scores based on coding exon overlap - # If there is a stale mate, chose best exon overlap score + # If there is a stale mate, choose best exon overlap score foreach my $tid (sort { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result)) { $score = $transcript_result{$tid}; $tl_score = $tl_transcript_result{$tid}; @@ -234,7 +233,7 @@ } } } - if (!defined $best_id) { + if (!defined $best_id) { if ($score >= $best_score) { $best_id = $tid; $best_score = $score; @@ -276,7 +275,7 @@ my $translation = $transcript->translation(); # Add link between Ensembl gene and EntrezGene (and WikiGene) - if (defined $entrez_ids{$entrez_id} ) { + if (defined $entrez_ids{$entrez_id}) { foreach my $dependent_xref_id (@{$entrez_ids{$entrez_id}}) { $add_dependent_xref_sth->execute($xref_id, $dependent_xref_id, $source_ids->{'entrezgene'}); } @@ -288,7 +287,7 @@ # Also store refseq protein as direct xref for ensembl translation, if translation exists if (defined $translation && defined $otherf_translation && ($otherf_translation->seq eq $translation->seq)) { my $translation_id = $otherf_translation->stable_id(); - my @xrefs = grep {$_->{dbname} eq 'GenBank'} @{$otherf_translation->get_all_DBEntries}; + my @xrefs = grep { $_->{dbname} eq 'GenBank' } @{$otherf_translation->get_all_DBEntries}; if (scalar @xrefs == 1) { $translation_id = $xrefs[0]->primary_id(); } @@ -298,14 +297,14 @@ $source_id = $source_ids->{'peptide'}; $source_id = $source_ids->{'peptide_predicted'} if $acc =~ /^XP_/; my $tl_xref_id = add_xref({ - acc => $acc, - version => $version, - label => $translation_id, - desc => undef, - source_id => $source_id, + acc => $acc, + version => $version, + label => 
$translation_id, + desc => undef, + source_id => $source_id, species_id => $species_id, - dbi => $dbi, - info_type => 'DIRECT' + dbi => $dbi, + info_type => 'DIRECT' }); add_direct_xref($tl_xref_id, $translation->stable_id(), "Translation", "", $dbi); } @@ -316,39 +315,25 @@ sub parse_url { my ($url) = @_; - my $parsed_url = Nextflow::Utils::parse($url); - my $user = $parsed_url->{'user'}; - my $pass = $parsed_url->{'pass'}; - my $host = $parsed_url->{'host'}; - my $port = $parsed_url->{'port'}; - my $db = $parsed_url->{'dbname'}; - - return ($user, $pass, $host, $port, $db); + return @{$parsed_url}{qw(host port user pass dbname)}; } sub get_dbi { my ($host, $port, $user, $pass, $dbname) = @_; - - my $dbconn; - if (defined $dbname) { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname); - } else { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s", $host, $port); - } - my $dbi = DBI->connect( $dbconn, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr ); - + my $dbconn = defined $dbname ? sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname) : sprintf("dbi:mysql:host=%s;port=%s", $host, $port); + my $dbi = DBI->connect($dbconn, $user, $pass, { 'RaiseError' => 1 }) or croak("Can't connect to database: " . $DBI::errstr); return $dbi; } -sub get_valid_codes{ +sub get_valid_codes { my ($source_name, $species_id, $dbi) = @_; my %valid_codes; my @sources; my $big_name = uc $source_name; - my $sql = "select source_id from source where upper(name) like '%$big_name%'"; + my $sql = "SELECT source_id FROM source WHERE UPPER(name) LIKE '%$big_name%'"; my $sth = $dbi->prepare($sql); $sth->execute(); while(my @row = $sth->fetchrow_array()){ @@ -357,7 +342,7 @@ sub get_valid_codes{ $sth->finish; foreach my $source (@sources){ - $sql = "select accession, xref_id from xref where species_id = $species_id and source_id = $source"; + $sql = "SELECT accession, xref_id FROM xref WHERE species_id = $species_id AND source_id = $source"; $sth = $dbi->prepare($sql); $sth->execute(); while(my @row = $sth->fetchrow_array()){ @@ -395,7 +380,7 @@ sub add_xref { return $xref_id; } - my $add_xref_sth = $dbi->prepare('INSERT INTO xref (accession,version,label,description,source_id,species_id, info_type, info_text) VALUES(?,?,?,?,?,?,?,?)'); + my $add_xref_sth = $dbi->prepare('INSERT INTO xref (accession,version,label,description,source_id,species_id, info_type, info_text) VALUES (?,?,?,?,?,?,?,?)'); # If the description is more than 255 characters, chop it off if (defined $description && ((length $description) > 255 )) { @@ -430,7 +415,7 @@ sub add_direct_xref { return; } -sub get_direct_xref{ +sub get_direct_xref { my ($stable_id, $type, $link, $dbi) = @_; $type = lc $type; diff --git a/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py b/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py index a869c1266..c12ce0e6b 100644 --- a/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py +++ b/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py @@ -14,16 +14,17 @@ """Xref module to print out advisory datachecks results (only needed now since we are still using perl datachecks).""" -from ensembl.production.xrefs.Base import * +import re +from ensembl.production.xrefs.Base import Base class AdvisoryXrefReport(Base): def run(self): - base_path = self.param_required("base_path", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - release = self.param_required("release", {"type": 
"int"}) - datacheck_name = self.param("datacheck_name", None, {"type": "str"}) - datacheck_output = self.param("datacheck_output", None, {"type": "str"}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + datacheck_name: str = self.get_param("datacheck_name", {"type": str}) + datacheck_output: str = self.get_param("datacheck_output", {"type": str}) # Create or locate report file report_file = self.get_path( diff --git a/src/python/ensembl/production/xrefs/Alignment.py b/src/python/ensembl/production/xrefs/Alignment.py index b8ee417a1..5edac0b00 100644 --- a/src/python/ensembl/production/xrefs/Alignment.py +++ b/src/python/ensembl/production/xrefs/Alignment.py @@ -14,47 +14,64 @@ """Alignment module to map xref sequences into ensEMBL ones.""" -from ensembl.production.xrefs.Base import * +import re +import subprocess +from sqlalchemy.dialects.mysql import insert +from ensembl.xrefs.xref_update_db_model import ( + MappingJobs as MappingJobsORM, + Mapping as MappingORM, +) + +from ensembl.production.xrefs.Base import Base class Alignment(Base): + XREF_HIT_PATTERN = re.compile(r"^xref") + def run(self): - base_path = self.param_required("base_path", {"type": "str"}) - method = self.param_required("align_method", {"type": "str"}) - query_cutoff = self.param_required("query_cutoff", {"type": "int"}) - target_cutoff = self.param_required("target_cutoff", {"type": "int"}) - max_chunks = self.param_required("max_chunks", {"type": "int"}) - chunk = self.param_required("chunk", {"type": "int"}) - job_index = self.param_required("job_index", {"type": "int"}) - source = self.param_required("source_file", {"type": "str"}) - target = self.param_required("target_file", {"type": "str"}) - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - map_file = self.param_required("map_file", {"type": "str"}) - source_id = self.param_required("source_id", {"type": "int"}) - seq_type = self.param_required("seq_type", {"type": "str"}) + method: str = self.get_param("align_method", {"required": True, "type": str}) + query_cutoff: int = self.get_param("query_cutoff", {"required": True, "type": int}) + target_cutoff: int = self.get_param("target_cutoff", {"required": True, "type": int}) + max_chunks: int = self.get_param("max_chunks", {"required": True, "type": int}) + chunk: int = self.get_param("chunk", {"required": True, "type": int}) + job_index: int = self.get_param("job_index", {"required": True, "type": int}) + source: str = self.get_param("source_file", {"required": True, "type": str}) + target: str = self.get_param("target_file", {"required": True, "type": str}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + map_file: str = self.get_param("map_file", {"required": True, "type": str}) + source_id: int = self.get_param("source_id", {"required": True, "type": int}) + seq_type: str = self.get_param("seq_type", {"required": True, "type": str}) # Construct Exonerate command ryo = "xref:%qi:%ti:%ei:%ql:%tl:%qab:%qae:%tab:%tae:%C:%s\n" - exe = ( - subprocess.check_output("which exonerate", shell=True) - .decode("utf-8") - .strip() - ) - command_string = f"{exe} --showalignment FALSE --showvulgar FALSE --ryo '{ryo}' --gappedextension FALSE --model 'affine:local' {method} --subopt no --query {source} --target {target} --querychunktotal {max_chunks} --querychunkid {chunk}" + exe = 
subprocess.check_output(["which", "exonerate"]).decode("utf-8").strip() + command_string = [ + exe, + "--showalignment", "FALSE", + "--showvulgar", "FALSE", + "--ryo", f"'{ryo}'", + "--gappedextension", "FALSE", + "--model", "'affine:local'", + method, + "--subopt", "no", + "--query", source, + "--target", target, + "--querychunktotal", str(max_chunks), + "--querychunkid", str(chunk) + ] # Get exonerate hits - output = subprocess.run(command_string, shell=True, stdout=subprocess.PIPE) + output = subprocess.run(command_string, stdout=subprocess.PIPE, text=True) exit_code = abs(output.returncode) if exit_code == 0: - hits = output.stdout.decode("utf-8").split("\n") + hits = output.stdout.split("\n") # Write to mapping file - map_fh = open(map_file, "w") - for hit in hits: - if re.search(r"^xref", hit): - map_fh.write(f"{hit}\n") - map_fh.close() + with open(map_file, "w") as map_fh: + for hit in hits: + if self.XREF_HIT_PATTERN.search(hit): + map_fh.write(f"{hit}\n") elif exit_code == 9: raise MemoryError( f"Exonerate failed due to insufficient memory (exit code: {exit_code})" diff --git a/src/python/ensembl/production/xrefs/CoordinateMapping.py b/src/python/ensembl/production/xrefs/CoordinateMapping.py index d687ebee1..332e06c6e 100644 --- a/src/python/ensembl/production/xrefs/CoordinateMapping.py +++ b/src/python/ensembl/production/xrefs/CoordinateMapping.py @@ -14,26 +14,27 @@ """Xref module to process the coordinate mappings.""" -from ensembl.production.xrefs.Base import * -from ensembl.production.xrefs.mappers.CoordinateMapper import CoordinateMapper +import logging +from typing import Optional +from ensembl.production.xrefs.Base import Base +from ensembl.production.xrefs.mappers.CoordinateMapper import CoordinateMapper class CoordinateMapping(Base): def run(self): - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - scripts_dir = self.param_required("perl_scripts_dir", {"type": "str"}) - registry = self.param("registry_url", None, {"type": "str"}) - core_db_url = self.param("species_db", None, {"type": "str"}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + scripts_dir: str = self.get_param("perl_scripts_dir", {"required": True, "type": str}) + registry: Optional[str] = self.get_param("registry_url", {"type": str}) + core_db_url: Optional[str] = self.get_param("species_db", {"type": str}) logging.info(f"CoordinateMapping starting for species '{species_name}'") + # Retrieve core database URL if not provided if not core_db_url: - core_db_url = self.get_db_from_registry( - species_name, "core", release, registry - ) + core_db_url = self.get_db_from_registry(species_name, "core", release, registry) # Get species id db_engine = self.get_db_engine(core_db_url) @@ -41,9 +42,7 @@ def run(self): species_id = self.get_taxon_id(core_dbi) # Get the appropriate mapper - mapper = self.get_xref_mapper( - xref_db_url, species_name, base_path, release, core_db_url, registry - ) + mapper = self.get_xref_mapper(xref_db_url, species_name, base_path, release, core_db_url, registry) # Process the coordinate xrefs coord = 
CoordinateMapper(mapper) diff --git a/src/python/ensembl/production/xrefs/DirectXrefs.py b/src/python/ensembl/production/xrefs/DirectXrefs.py index f6522b274..121f4f897 100644 --- a/src/python/ensembl/production/xrefs/DirectXrefs.py +++ b/src/python/ensembl/production/xrefs/DirectXrefs.py @@ -14,25 +14,25 @@ """Xref module to process direct xrefs.""" -from ensembl.production.xrefs.Base import * -from ensembl.production.xrefs.mappers.DirectXrefsMapper import DirectXrefsMapper +import logging +from typing import Optional +from ensembl.production.xrefs.Base import Base +from ensembl.production.xrefs.mappers.DirectXrefsMapper import DirectXrefsMapper class DirectXrefs(Base): def run(self): - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - registry = self.param("registry_url", None, {"type": "str"}) - core_db_url = self.param("species_db", None, {"type": "str"}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + registry: Optional[str] = self.get_param("registry_url", {"type": str}) + core_db_url: Optional[str] = self.get_param("species_db", {"type": str}) logging.info(f"DirectXrefs starting for species '{species_name}'") # Get the appropriate mapper - mapper = self.get_xref_mapper( - xref_db_url, species_name, base_path, release, core_db_url, registry - ) + mapper = self.get_xref_mapper(xref_db_url, species_name, base_path, release, core_db_url, registry) # Process the direct xrefs direct_mappings = DirectXrefsMapper(mapper) diff --git a/src/python/ensembl/production/xrefs/DumpEnsembl.py b/src/python/ensembl/production/xrefs/DumpEnsembl.py index 84ce39b47..c34635f6d 100644 --- a/src/python/ensembl/production/xrefs/DumpEnsembl.py +++ b/src/python/ensembl/production/xrefs/DumpEnsembl.py @@ -14,68 +14,65 @@ """Dumping module to dump sequence data from a core db.""" -from ensembl.production.xrefs.Base import * +import os +import subprocess +import logging +from ensembl.production.xrefs.Base import Base class DumpEnsembl(Base): def run(self): - species_name = self.param_required("species_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - core_db_url = self.param_required("species_db", {"type": "str"}) - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - retry = self.param("retry", None, {"type": "bool", "default": False}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + core_db_url: str = self.get_param("species_db", {"required": True, "type": str}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + retry: bool = self.get_param("retry", {"type": bool, "default": False}) logging.info(f"DumpEnsembl starting for species '{species_name}'") - # Create files paths - cdna_path = self.get_path( - base_path, species_name, release, "ensembl", "transcripts.fa" - ) - pep_path = 
self.get_path( - base_path, species_name, release, "ensembl", "peptides.fa" - ) + # Create file paths + cdna_path = self.get_path(base_path, species_name, release, "ensembl", "transcripts.fa") + pep_path = self.get_path(base_path, species_name, release, "ensembl", "peptides.fa") # Check if dumping has been done for this run before, to speed up development by not having to re-dump sequences - if ( - not retry - and os.path.exists(cdna_path) - and os.path.getsize(cdna_path) > 0 - and os.path.exists(pep_path) - and os.path.getsize(pep_path) > 0 - ): - logging.info( - f"Dna and peptide data already dumped for species '{species_name}', skipping." - ) + if not retry and os.path.exists(cdna_path) and os.path.getsize(cdna_path) > 0 and os.path.exists(pep_path) and os.path.getsize(pep_path) > 0: + logging.info(f"Dna and peptide data already dumped for species '{species_name}', skipping.") else: - scripts_dir = self.param_required("perl_scripts_dir") + scripts_dir: str = self.get_param("perl_scripts_dir", {"required": True, "type": str}) logging.info(f"Running perl script {scripts_dir}/dump_ensembl.pl") - perl_cmd = f"perl {scripts_dir}/dump_ensembl.pl --cdna_path '{cdna_path}' --pep_path '{pep_path}' --species {species_name} --core_db_url '{core_db_url}' --release {release}" - cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE) + perl_cmd = [ + "perl", + f"{scripts_dir}/dump_ensembl.pl", + "--cdna_path", cdna_path, + "--pep_path", pep_path, + "--species", species_name, + "--core_db_url", core_db_url, + "--release", str(release) + ] + # subprocess.run(perl_cmd, check=True, stdout=subprocess.PIPE) + subprocess.run(perl_cmd, capture_output=True, text=True, check=True) # Create jobs for peptide dumping and alignment - dataflow_params = { + self.write_output("dump_xref", { "species_name": species_name, "file_path": pep_path, "xref_db_url": xref_db_url, "seq_type": "peptide", - } - self.write_output("dump_xref", dataflow_params) + }) # Create jobs for cdna dumping and alignment - dataflow_params = { + self.write_output("dump_xref", { "species_name": species_name, "file_path": cdna_path, "xref_db_url": xref_db_url, "seq_type": "dna", - } - self.write_output("dump_xref", dataflow_params) + }) # Create job for schedule mapping - dataflow_params = { + self.write_output("schedule_mapping", { "species_name": species_name, "xref_db_url": xref_db_url, "species_db": core_db_url, - } - self.write_output("schedule_mapping", dataflow_params) + }) diff --git a/src/python/ensembl/production/xrefs/DumpXref.py b/src/python/ensembl/production/xrefs/DumpXref.py index 268c8cae2..94b7c1a04 100644 --- a/src/python/ensembl/production/xrefs/DumpXref.py +++ b/src/python/ensembl/production/xrefs/DumpXref.py @@ -14,22 +14,37 @@ """Dumping module to dump xref sequence data from an xref intermediate db.""" -from ensembl.production.xrefs.Base import * - +import json +import logging +import os +import re +from sqlalchemy import select from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord +from ensembl.xrefs.xref_update_db_model import ( + Source as SourceUORM, + Xref as XrefUORM, + PrimaryXref as PrimaryXrefORM, +) + +from ensembl.production.xrefs.Base import Base class DumpXref(Base): + REFSEQ_DNA_PATTERN = re.compile(r"RefSeq_.*RNA") + REFSEQ_PEP_PATTERN = re.compile(r"RefSeq_peptide") + FILE_NAME_PATTERN = re.compile(r"\/") + SEQUENCE_PATTERN = re.compile(r"(J|O|U)") + def run(self): - species_name = self.param_required("species_name", {"type": "str"}) - base_path = 
self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - file_path = self.param_required("file_path", {"type": "str"}) - seq_type = self.param_required("seq_type", {"type": "str"}) - config_file = self.param_required("config_file", {"type": "str"}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + file_path: str = self.get_param("file_path", {"required": True, "type": str}) + seq_type: str = self.get_param("seq_type", {"required": True, "type": str}) + config_file: str = self.get_param("config_file", {"required": True, "type": str}) logging.info( f"DumpXref starting for species '{species_name}' with file_path '{file_path}' and seq_type '{seq_type}'" @@ -42,19 +57,13 @@ def run(self): full_path = self.get_path(base_path, species_name, release, "xref") # Extract sources to download from config file - sources = [] with open(config_file) as conf_file: sources = json.load(conf_file) # Create hash of available alignment methods - method = {} - query_cutoff = {} - target_cutoff = {} - for source in sources: - if source.get("method"): - method[source["name"]] = source["method"] - query_cutoff[source["name"]] = source.get("query_cutoff") - target_cutoff[source["name"]] = source.get("target_cutoff") + method = {source["name"]: source["method"] for source in sources if source.get("method")} + query_cutoff = {source["name"]: source.get("query_cutoff") for source in sources if source.get("method")} + target_cutoff = {source["name"]: source.get("target_cutoff") for source in sources if source.get("method")} job_index = 1 @@ -68,46 +77,42 @@ def run(self): source_name = source.name source_id = source.source_id - if re.search(r"RefSeq_.*RNA", source_name): + if self.REFSEQ_DNA_PATTERN.search(source_name): source_name = "RefSeq_dna" - if re.search("RefSeq_peptide", source_name): + if self.REFSEQ_PEP_PATTERN.search(source_name): source_name = "RefSeq_peptide" - if method.get(source_name): + if source_name in method: method_name = method[source_name] source_query_cutoff = query_cutoff[source_name] source_target_cutoff = target_cutoff[source_name] # Open fasta file - file_source_name = source.name - file_source_name = re.sub(r"\/", "", file_source_name) + file_source_name = self.FILE_NAME_PATTERN.sub("", source.name) filename = os.path.join( full_path, f"{seq_type}_{file_source_name}_{source_id}.fasta" ) - fasta_fh = open(filename, "w") - - # Get xref sequences - sequence_query = select( - PrimaryXrefORM.xref_id, PrimaryXrefORM.sequence - ).where( - XrefUORM.xref_id == PrimaryXrefORM.xref_id, - PrimaryXrefORM.sequence_type == seq_type, - XrefUORM.source_id == source_id, - ) - for sequence in xref_dbi.execute(sequence_query).mappings().all(): - # Ambiguous peptides must be cleaned out to protect Exonerate from J,O and U codes - seq = sequence.sequence.upper() - if seq_type == "peptide": - seq = re.sub(r"(J|O|U)", "X", seq) - - # Print sequence - SeqIO.write( - SeqRecord(Seq(seq), id=str(sequence.xref_id), description=""), - fasta_fh, - "fasta", + with open(filename, "w") as fasta_fh: + # Get xref sequences + sequence_query = select( + PrimaryXrefORM.xref_id, PrimaryXrefORM.sequence + ).where( + XrefUORM.xref_id == 
PrimaryXrefORM.xref_id, + PrimaryXrefORM.sequence_type == seq_type, + XrefUORM.source_id == source_id, ) - - fasta_fh.close() + for sequence in xref_dbi.execute(sequence_query).mappings().all(): + # Ambiguous peptides must be cleaned out to protect Exonerate from J,O and U codes + seq = sequence.sequence.upper() + if seq_type == "peptide": + seq = self.SEQUENCE_PATTERN.sub("X", seq) + + # Print sequence + SeqIO.write( + SeqRecord(Seq(seq), id=str(sequence.xref_id), description=""), + fasta_fh, + "fasta", + ) # Pass data into alignment jobs self.write_output( diff --git a/src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py b/src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py index 3513c7afc..cc015ee0b 100644 --- a/src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py +++ b/src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py @@ -14,20 +14,22 @@ """Email module to send user emails notifying of advisory DC failures.""" -from ensembl.production.xrefs.Base import * +import os +import re from smtplib import SMTP from email.message import EmailMessage +from ensembl.production.xrefs.Base import Base class EmailAdvisoryXrefReport(Base): def run(self): - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - pipeline_name = self.param_required("pipeline_name", {"type": "str"}) - email_address = self.param_required("email", {"type": "str"}) - email_server = self.param_required("email_server", {"type": "str"}) - log_timestamp = self.param("log_timestamp", None, {"type": "str"}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + pipeline_name: str = self.get_param("pipeline_name", {"required": True, "type": str}) + email_address: str = self.get_param("email", {"required": True, "type": str}) + email_server: str = self.get_param("email_server", {"required": True, "type": str}) + log_timestamp: str = self.get_param("log_timestamp", {"type": str}) # Get the path and name of main reports file formatted_name = re.sub(r"\s", "_", pipeline_name) @@ -38,40 +40,38 @@ def run(self): else: log_path = os.path.join(base_path, "logs") if not os.path.exists(log_path): - os.makedir(log_path) + os.makedirs(log_path) main_report_file_name = f"{main_report_file_name}.log" main_report_file = os.path.join(log_path, main_report_file_name) - main_fh = open(main_report_file, "a") + with open(main_report_file, "a") as main_fh: - species_with_reports = {} + species_with_reports = {} - # Get species in base path - species_list = os.listdir(base_path) + # Get species in base path + species_list = os.listdir(base_path) - for species in species_list: - # Check if reports exist - dc_path = os.path.join(base_path, species, release, "dc_report") - if os.path.exists(dc_path): - # Get report files - dc_files = os.listdir(dc_path) + for species in species_list: + # Check if reports exist + dc_path = os.path.join(base_path, species, str(release), "dc_report") + if os.path.exists(dc_path): + # Get report files + dc_files = os.listdir(dc_path) - # Add each dc report into main report file - for dc_file in dc_files: - with open(os.path.join(dc_path, dc_file), "r") as file: - dc_data = file.read() + # Add each dc report into main report file + for dc_file in dc_files: + with open(os.path.join(dc_path, dc_file), "r") as file: + dc_data = file.read() - main_fh.write(f"{dc_data}\n") + main_fh.write(f"{dc_data}\n") - dc_name = 
dc_file.replace(".log", "") - if species_with_reports.get(dc_name): - species_with_reports[dc_name].append(species) - else: - species_with_reports[dc_name] = [species] + dc_name = dc_file.replace(".log", "") + if dc_name in species_with_reports: + species_with_reports[dc_name].append(species) + else: + species_with_reports[dc_name] = [species] - # TO DO: maybe delete individual reports - - main_fh.close() + # TO DO: maybe delete individual reports email_message = f"Some advisory datachecks have failed for the following species in the xref pipeline run ({pipeline_name}).

" for dc_name, species_list in species_with_reports.items(): @@ -96,5 +96,5 @@ def run(self): file_data, maintype="text", subtype="plain", filename=main_report_file_name ) - smtp = SMTP(email_server) - smtp.send_message(message) + with SMTP(email_server) as smtp: + smtp.send_message(message) diff --git a/src/python/ensembl/production/xrefs/Mapping.py b/src/python/ensembl/production/xrefs/Mapping.py index 838470c1a..a6f956d03 100644 --- a/src/python/ensembl/production/xrefs/Mapping.py +++ b/src/python/ensembl/production/xrefs/Mapping.py @@ -14,7 +14,9 @@ """Mapping module to map the added xrefs into the core DB.""" -from ensembl.production.xrefs.Base import * +import logging + +from ensembl.production.xrefs.Base import Base from ensembl.production.xrefs.mappers.ProcessPriorities import ProcessPriorities from ensembl.production.xrefs.mappers.ProcessPaired import ProcessPaired from ensembl.production.xrefs.mappers.ProcessMoves import ProcessMoves @@ -23,16 +25,15 @@ from ensembl.production.xrefs.mappers.XrefLoader import XrefLoader from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs - class Mapping(Base): def run(self): - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - registry = self.param("registry_url", None, {"type": "str"}) - core_db_url = self.param("species_db", None, {"type": "str"}) - verbose = self.param("verbose", None, {"default": False}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + registry: str = self.get_param("registry_url", {"type": str}) + core_db_url: str = self.get_param("species_db", {"type": str}) + verbose: bool = self.get_param("verbose", {"type": bool, "default": False}) logging.info(f"Mapping starting for species '{species_name}'") @@ -47,9 +48,7 @@ def run(self): species_id = self.get_taxon_id(core_dbi) # Get the appropriate mapper - mapper = self.get_xref_mapper( - xref_db_url, species_name, base_path, release, core_db_url, registry - ) + mapper = self.get_xref_mapper(xref_db_url, species_name, base_path, release, core_db_url, registry) # Process the xref priorities priorities = ProcessPriorities(mapper) diff --git a/src/python/ensembl/production/xrefs/ParseSource.py b/src/python/ensembl/production/xrefs/ParseSource.py index d3024fe20..eb35119b0 100644 --- a/src/python/ensembl/production/xrefs/ParseSource.py +++ b/src/python/ensembl/production/xrefs/ParseSource.py @@ -14,23 +14,27 @@ """Parsing module to call specific file/db parsers based on xref source.""" -from ensembl.production.xrefs.Base import * +import logging +import re +import importlib +from typing import Optional +from ensembl.production.xrefs.Base import Base class ParseSource(Base): - def run(self): - parser_name = self.param_required("parser", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - species_id = self.param_required("species_id", {"type": "int"}) - file_name = self.param_required("file_name", {"type": "str"}) - source_id = self.param_required("source_id", {"type": "int"}) - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - 
registry = self.param_required("registry_url", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - core_db_url = self.param_required("core_db_url", {"type": "str"}) - db = self.param("db", None, {"type": "str"}) - release_file = self.param("release_file", None, {"type": "str"}) - source_name = self.param("source_name", None, {"type": "str"}) + def run(self) -> None: + parser_name: str = self.get_param("parser", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + species_id: int = self.get_param("species_id", {"required": True, "type": int}) + file_name: str = self.get_param("file_name", {"required": True, "type": str}) + source_id: int = self.get_param("source_id", {"required": True, "type": int}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + registry_url: str = self.get_param("registry_url", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + core_db_url: str = self.get_param("core_db_url", {"required": True, "type": str}) + db: Optional[str] = self.get_param("db", {"type": str}) + release_file: Optional[str] = self.get_param("release_file", {"type": str}) + source_name: Optional[str] = self.get_param("source_name", {"type": str}) logging.info( f"ParseSource starting for source '{source_name}' with parser '{parser_name}' for species '{species_name}'" @@ -54,22 +58,22 @@ def run(self): # Get the extra db, if any if db: - dba = self.param(f"{db}_db_url") - if not dba: - dba = self.get_db_from_registry(species_name, db, release, registry) + db_url = self.get_param(f"{db}_db_url", {"type": str}) + if not db_url: + db_url = self.get_db_from_registry(species_name, db, release, registry_url) - args["dba"] = dba + args["extra_db_url"] = db_url args["ensembl_release"] = release args["core_db_url"] = core_db_url # For RefSeqCoordinate source, we run a perl script if parser_name == "RefSeqCoordinateParser": - args["perl_scripts_dir"] = self.param_required("perl_scripts_dir") + args["perl_scripts_dir"] = self.get_param("perl_scripts_dir", {"required": True, "type": str}) args["xref_db_url"] = xref_db_url # For UniProt we need the hgnc file to extract descriptions if re.search(r"^UniProt", parser_name): - args['hgnc_file'] = self.param("hgnc_file", None, {"type": "str"}) + args['hgnc_file'] = self.get_param("hgnc_file", {"type": str}) # Import the parser module_name = f"ensembl.production.xrefs.parsers.{parser_name}" @@ -77,7 +81,7 @@ def run(self): parser_class = getattr(module, parser_name) parser = parser_class() - (errors, message) = parser.run(args) + errors, message = parser.run(args) failure += errors xref_dbi.close() diff --git a/src/python/ensembl/production/xrefs/ProcessAlignment.py b/src/python/ensembl/production/xrefs/ProcessAlignment.py index 1f2295d43..289b37166 100644 --- a/src/python/ensembl/production/xrefs/ProcessAlignment.py +++ b/src/python/ensembl/production/xrefs/ProcessAlignment.py @@ -12,25 +12,24 @@ """Xref module to process the sequence-matched alignments.""" -from ensembl.production.xrefs.Base import * -from ensembl.production.xrefs.mappers.ProcessMappings import ProcessMappings +import logging +from ensembl.production.xrefs.Base import Base +from ensembl.production.xrefs.mappers.ProcessMappings import ProcessMappings class ProcessAlignment(Base): def run(self): - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - species_name = self.param_required("species_name", 
{"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - registry = self.param("registry_url", None, {"type": "str"}) - core_db_url = self.param("species_db", None, {"type": "str"}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + registry: str = self.get_param("registry_url", {"type": str}) + core_db_url: str = self.get_param("species_db", {"type": str}) logging.info(f"ProcessAlignment starting for species '{species_name}'") # Get the appropriate mapper - mapper = self.get_xref_mapper( - xref_db_url, species_name, base_path, release, core_db_url, registry - ) + mapper = self.get_xref_mapper(xref_db_url, species_name, base_path, release, core_db_url, registry) # Process the alignments mappings = ProcessMappings(mapper) diff --git a/src/python/ensembl/production/xrefs/RNACentralMapping.py b/src/python/ensembl/production/xrefs/RNACentralMapping.py index e71353f50..16646495d 100644 --- a/src/python/ensembl/production/xrefs/RNACentralMapping.py +++ b/src/python/ensembl/production/xrefs/RNACentralMapping.py @@ -14,27 +14,26 @@ """Xref module to process the RNAcentral mappings.""" -from ensembl.production.xrefs.Base import * +import logging + +from ensembl.production.xrefs.Base import Base from ensembl.production.xrefs.mappers.RNACentralMapper import RNACentralMapper from ensembl.production.xrefs.mappers.methods.MySQLChecksum import MySQLChecksum - class RNACentralMapping(Base): def run(self): - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - source_db_url = self.param_required("source_db_url", {"type": "str"}) - registry = self.param("registry_url", None, {"type": "str"}) - core_db_url = self.param("species_db", None, {"type": "str"}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) + registry: str = self.get_param("registry_url", {"type": str}) + core_db_url: str = self.get_param("species_db", {"type": str}) logging.info(f"RNACentralMapping starting for species '{species_name}'") if not core_db_url: - core_db_url = self.get_db_from_registry( - species_name, "core", release, registry - ) + core_db_url = self.get_db_from_registry(species_name, "core", release, registry) # Get species id db_engine = self.get_db_engine(core_db_url) @@ -43,15 +42,13 @@ def run(self): # Get the rna central mapper mapper = RNACentralMapper( - self.get_xref_mapper( - xref_db_url, species_name, base_path, release, core_db_url, registry - ) + self.get_xref_mapper(xref_db_url, species_name, base_path, release, core_db_url, registry) ) # Get source id db_engine = self.get_db_engine(source_db_url) with db_engine.connect() as source_dbi: - source_id = self.get_source_id_from_name(source_dbi, 
"RNACentral") + source_id = self.get_source_id_from_name("RNACentral", source_dbi) method = MySQLChecksum({"MAPPER": mapper}) results = method.run( diff --git a/src/python/ensembl/production/xrefs/ScheduleAlignment.py b/src/python/ensembl/production/xrefs/ScheduleAlignment.py index d1fca7697..36787edc4 100644 --- a/src/python/ensembl/production/xrefs/ScheduleAlignment.py +++ b/src/python/ensembl/production/xrefs/ScheduleAlignment.py @@ -14,24 +14,28 @@ """Scheduling module to create xref/ensEMBL alignment jobs.""" -from ensembl.production.xrefs.Base import * +import logging +import os +from typing import Optional +from ensembl.production.xrefs.Base import Base class ScheduleAlignment(Base): def run(self): - species_name = self.param_required("species_name", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - target_file = self.param_required("ensembl_fasta", {"type": "str"}) - source_file = self.param_required("xref_fasta", {"type": "str"}) - seq_type = self.param_required("seq_type", {"type": "str"}) - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - method = self.param_required("method", {"type": "str"}) - query_cutoff = self.param_required("query_cutoff", {"type": "int"}) - target_cutoff = self.param_required("target_cutoff", {"type": "int"}) - source_id = self.param_required("source_id", {"type": "int"}) - source_name = self.param_required("source_name", {"type": "str"}) - job_index = self.param_required("job_index", {"type": "int"}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + target_file: str = self.get_param("ensembl_fasta", {"required": True, "type": str}) + source_file: str = self.get_param("xref_fasta", {"required": True, "type": str}) + seq_type: str = self.get_param("seq_type", {"required": True, "type": str}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + method: str = self.get_param("method", {"required": True, "type": str}) + query_cutoff: int = self.get_param("query_cutoff", {"required": True, "type": int}) + target_cutoff: int = self.get_param("target_cutoff", {"required": True, "type": int}) + source_id: int = self.get_param("source_id", {"required": True, "type": int}) + source_name: str = self.get_param("source_name", {"required": True, "type": str}) + job_index: int = self.get_param("job_index", {"required": True, "type": int}) + chunk_size: Optional[int] = self.get_param("chunk_size", {"type": int, "default": 1000000}) logging.info( f"ScheduleAlignment starting for species '{species_name}' with seq_type '{seq_type}' and job_index '{job_index}'" @@ -39,14 +43,14 @@ def run(self): # Inspect file size to decide on chunking size = os.stat(target_file).st_size - chunks = int(size / 1000000) + 1 + chunks = int(size / chunk_size) + 1 # Create output path output_path = self.get_path(base_path, species_name, release, "alignment") # Pass alignment data for each chunk chunklet = 1 - while chunklet <= chunks: + for chunklet in range(1, chunks + 1): output_path_chunk = os.path.join( output_path, f"{seq_type}_alignment_{source_id}_{chunklet}_of_{chunks}.map", @@ -70,4 +74,3 @@ def run(self): "seq_type": seq_type, }, ) - chunklet += 1 diff --git a/src/python/ensembl/production/xrefs/ScheduleMapping.py 
b/src/python/ensembl/production/xrefs/ScheduleMapping.py index 44032ad76..94085ce11 100644 --- a/src/python/ensembl/production/xrefs/ScheduleMapping.py +++ b/src/python/ensembl/production/xrefs/ScheduleMapping.py @@ -14,25 +14,24 @@ """Scheduling module to create xref mapping jobs.""" -from ensembl.production.xrefs.Base import * -from ensembl.production.xrefs.mappers.CoreInfo import CoreInfo +import logging +from ensembl.production.xrefs.Base import Base +from ensembl.production.xrefs.mappers.CoreInfo import CoreInfo class ScheduleMapping(Base): def run(self): - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - registry = self.param("registry_url", None, {"type": "str"}) - core_db_url = self.param("species_db", None, {"type": "str"}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + registry: str = self.get_param("registry_url", {"type": str}) + core_db_url: str = self.get_param("species_db", {"type": str}) logging.info(f"ScheduleMapping starting for species '{species_name}'") # Get the appropriate mapper - mapper = self.get_xref_mapper( - xref_db_url, species_name, base_path, release, core_db_url, registry - ) + mapper = self.get_xref_mapper(xref_db_url, species_name, base_path, release, core_db_url, registry) # Load the core data logging.info("Loading core data") diff --git a/src/python/ensembl/production/xrefs/ScheduleParse.py b/src/python/ensembl/production/xrefs/ScheduleParse.py index cf044e1ee..149eb1c71 100644 --- a/src/python/ensembl/production/xrefs/ScheduleParse.py +++ b/src/python/ensembl/production/xrefs/ScheduleParse.py @@ -15,20 +15,30 @@ """Scheduling module to create parsing jobs for each xref source.""" import glob +import logging +import os +import re +from sqlalchemy import select +from sqlalchemy.engine.url import make_url +from typing import Tuple, Optional -from ensembl.production.xrefs.Base import * +from ensembl.xrefs.xref_source_db_model import ( + Source as SourceSORM, + Version as VersionORM, +) +from ensembl.production.xrefs.Base import Base class ScheduleParse(Base): def run(self): - species_name = self.param_required("species_name", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - registry = self.param_required("registry_url", {"type": "str"}) - order_priority = self.param_required("priority", {"type": "int"}) - source_db_url = self.param_required("source_db_url", {"type": "str"}) - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - get_species_file = self.param_required("get_species_file", {"type": "bool"}) - core_db_url = self.param("species_db", None, {"type": "str"}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + registry: str = self.get_param("registry_url", {"required": True, "type": str}) + order_priority: int = self.get_param("priority", {"required": True, "type": int}) + source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) + xref_db_url: str = self.get_param("xref_db_url", 
{"required": True, "type": str}) + get_species_file: bool = self.get_param("get_species_file", {"required": True, "type": bool}) + core_db_url: Optional[str] = self.get_param("species_db", {"type": str}) logging.info(f"ScheduleParse starting for species '{species_name}'") logging.info(f"\tParam: order_priority = {order_priority}") @@ -40,14 +50,30 @@ def run(self): # Create Xref database only at priority 1 (one time) if order_priority == 1: - sources_config_file = self.param_required("sources_config_file") + sources_config_file: str = self.get_param("sources_config_file", {"required": True, "type": str}) logging.info(f"\tParam: sources_config_file = {sources_config_file}") + # Construct xref update db name, truncating if necessary + max_length = 64 + original_species_name = species_name + xref_db_name = f"{species_name}_xref_update_{release}" + if len(xref_db_name) > max_length: + # Try to shorten the name by replacing "_collection" with "_col" + if species_name.endswith("_collection"): + species_name = species_name.replace("_collection", "_col") + xref_db_name = f"{species_name}_xref_update_{release}" + + # If still too long, truncate the _xref_update_ part + if len(xref_db_name) > max_length: + xref_db_name = f"{species_name}_xup_{release}" + + # If still too long, raise an error + if len(xref_db_name) > max_length: + raise ValueError(f"Could not sufficiently reduce DB name for species {species_name}") + species_name = original_species_name + # Construct xref update url - xref_db_url = make_url(xref_db_url) - xref_db_url = xref_db_url.set( - database=f"{species_name}_xref_update_{release}" - ) + xref_db_url = make_url(xref_db_url).set(database=xref_db_name) self.create_xref_db(xref_db_url, sources_config_file) xref_db_url = xref_db_url.render_as_string(hide_password=False) @@ -68,13 +94,10 @@ def run(self): species_name, "core", release, registry ) if not re.search(r"^mysql://", core_db_url): - core_db_url = "mysql://" + core_db_url + core_db_url = f"mysql://{core_db_url}" # Get species and division ids - db_engine = self.get_db_engine(core_db_url) - with db_engine.connect() as core_dbi: - species_id = self.get_taxon_id(core_dbi) - division_id = self.get_division_id(core_dbi) + species_id, division_id = self.get_core_db_info(core_db_url) # Retrieve list of sources from source database db_engine = self.get_db_engine(source_db_url) @@ -104,9 +127,7 @@ def run(self): if source.name == "HGNC": hgnc_path = source.file_path - if source.db == "checksum": - continue - if source.priority != order_priority: + if source.db == "checksum" or source.priority != order_priority: continue dataflow_params = { @@ -117,9 +138,7 @@ def run(self): } # Use clean files if available - file_name = source.file_path - if source.clean_path: - file_name = source.clean_path + file_name = source.clean_path or source.file_path # Some sources are species-specific source_id = self.get_source_id( @@ -128,11 +147,12 @@ def run(self): if not source_id: continue - dataflow_params["source_id"] = source_id - dataflow_params["source_name"] = source.name - dataflow_params["parser"] = source.parser - if source.revision: - dataflow_params["release_file"] = source.revision + dataflow_params.update({ + "source_id": source_id, + "source_name": source.name, + "parser": source.parser, + "release_file": source.revision if source.revision else None, + }) # Some sources need a connection to a special database if source.db: @@ -163,30 +183,24 @@ def run(self): total_sources += 1 else: # Get list of files if directory - if 
os.path.isdir(file_name): - list_files = os.listdir(file_name) - list_files = [os.path.join(file_name, f) for f in list_files] - else: - list_files = [file_name] + list_files = ( + [os.path.join(file_name, f) for f in os.listdir(file_name)] + if os.path.isdir(file_name) + else [file_name] + ) # For Uniprot and Refseq, files might have been split by species if get_species_file: - match source.name: - case "Uniprot/SWISSPROT": - file_prefix = "uniprot_sprot" - case "Uniprot/SPTREMBL": - file_prefix = "uniprot_trembl" - case "RefSeq_dna": - file_prefix = "refseq_rna" - case "RefSeq_peptide": - file_prefix = "refseq_protein" - case _: - file_prefix = None + file_prefix = { + "Uniprot/SWISSPROT": "uniprot_sprot", + "Uniprot/SPTREMBL": "uniprot_trembl", + "RefSeq_dna": "refseq_rna", + "RefSeq_peptide": "refseq_protein", + }.get(source.name) if file_prefix: list_files = glob.glob( - file_name + "/**/" + file_prefix + "-" + str(species_id), - recursive=True, + f"{file_name}/**/{file_prefix}-{species_id}", recursive=True ) if source.name == "ZFIN_ID": @@ -198,7 +212,8 @@ def run(self): dataflow_params["file_name"] = file - if re.search(r"^Uniprot", source.name): + if re.search(r"^Uniprot", source.name) and hgnc_path: + hgnc_files = glob.glob(hgnc_path + "/*") dataflow_params["hgnc_file"] = hgnc_files[0] @@ -208,8 +223,7 @@ def run(self): xref_dbi.close() if total_sources == 0: - with open(f"dataflow_{dataflow_suffix}.json", "a") as fh: - fh.write("") + self.write_output(dataflow_suffix, {}) dataflow_params = { "species_name": species_name, @@ -217,3 +231,11 @@ def run(self): "xref_db_url": xref_db_url, } self.write_output(dataflow_sub_suffix, dataflow_params) + + def get_core_db_info(self, core_db_url: str) -> Tuple[int, int]: + db_engine = self.get_db_engine(core_db_url) + with db_engine.connect() as core_dbi: + species_id = self.get_taxon_id(core_dbi) + division_id = self.get_division_id(core_dbi) + + return species_id, division_id diff --git a/src/python/ensembl/production/xrefs/ScheduleSpecies.py b/src/python/ensembl/production/xrefs/ScheduleSpecies.py index e63de241a..9537fa452 100644 --- a/src/python/ensembl/production/xrefs/ScheduleSpecies.py +++ b/src/python/ensembl/production/xrefs/ScheduleSpecies.py @@ -14,20 +14,24 @@ """Scheduling module to create a pipeline branch for each species in list or division.""" -from ensembl.production.xrefs.Base import * +import logging +import re +import requests +from typing import List, Dict, Optional +from ensembl.production.xrefs.Base import Base class ScheduleSpecies(Base): def run(self): - run_all = self.param_required("run_all", {"type": "bool"}) - registry = self.param_required("registry_url", {"type": "str"}) - ensembl_release = self.param_required("release", {"type": "int"}) - metasearch_url = self.param_required("metasearch_url", {"type": "str"}) - species = self.param("species", None, {"default": "", "type": "str"}) - antispecies = self.param("antispecies", None, {"default": "", "type": "str"}) - division = self.param("division", None, {"default": "", "type": "str"}) - db_prefix = self.param("db_prefix", None, {"type": "str"}) - group = self.param("group", None, {"default": "core", "type": "str"}) + run_all: bool = self.get_param("run_all", {"required": True, "type": bool}) + registry: str = self.get_param("registry_url", {"required": True, "type": str}) + ensembl_release: int = self.get_param("release", {"required": True, "type": int}) + metasearch_url: str = self.get_param("metasearch_url", {"required": True, "type": str}) + species: list = 
self.get_param("species", {"default": [], "type": list}) + antispecies: list = self.get_param("antispecies", {"default": [], "type": list}) + division: list = self.get_param("division", {"default": [], "type": list}) + group: str = self.get_param("group", {"default": "core", "type": str}) + db_prefix: Optional[str] = self.get_param("db_prefix", {"type": str}) logging.info("ScheduleSpecies starting with parameters:") logging.info(f"\tParam: run_all = {run_all}") @@ -40,102 +44,49 @@ def run(self): logging.info(f"\tParam: db_prefix = {db_prefix}") logging.info(f"\tParam: group = {group}") - if species: - species = species.split(",") - if antispecies: - antispecies = antispecies.split(",") - if division: - division = division.split(",") - ensembl_release = str(ensembl_release) - # Fix registry url, if needed - match = re.search(r"^(.*)://(.*)", registry) - if match: - registry = match.group(2) - match = re.search(r"(.*)/(.*)", registry) - if match: - registry = match.group(1) + registry = self._fix_registry_url(registry) loaded_dbs = {} dbs = [] # Construct the db name pattern - name_pattern = f"%_{group}%" - if db_prefix: - db_prefix = f"{db_prefix}_" - else: - db_prefix = "" - name_pattern = f"{db_prefix}{name_pattern}" + db_prefix = f"{db_prefix}_" if db_prefix else "" + name_pattern = f"{db_prefix}%_{group}%" # Getting all dbs if run_all: - metasearch_body = { - "name_pattern": name_pattern, - "filters": [ - {"meta_key": "schema_version", "meta_value": ensembl_release}, - ], - "servers": [registry], - } - - # Query registry for all core dbs - dbs = requests.post(metasearch_url, json=metasearch_body).json() - dbs = dbs[registry] - + dbs = self._query_registry(metasearch_url, name_pattern, ensembl_release, registry) loaded_dbs = self.check_validity(dbs, db_prefix, group, ensembl_release) # Getting dbs for specified species - elif species and len(species) > 0: + elif species: for species_name in species: - name_pattern = f"{species_name}_core%" - name_pattern = f"{db_prefix}{name_pattern}" - - metasearch_body = { - "name_pattern": name_pattern, - "filters": [ - {"meta_key": "schema_version", "meta_value": ensembl_release}, - ], - "servers": [registry], - } - - # Query registry for species dbs - species_dbs = requests.post(metasearch_url, json=metasearch_body).json() - - if len(species_dbs[registry]) < 1: - raise IOError( - f"Database not found for {species_name}, check registry parameters" - ) - else: - dbs = dbs + species_dbs[registry] + species_pattern = f"{db_prefix}{species_name}_core%" + species_dbs = self._query_registry(metasearch_url, species_pattern, ensembl_release, registry) + if not species_dbs: + raise LookupError(f"Database not found for {species_name}, check registry parameters") + dbs.extend(species_dbs) loaded_dbs = self.check_validity(dbs, db_prefix, group, ensembl_release) # Check if all wanted species were found - for species_name in species: - if not loaded_dbs.get(species_name): - raise IOError( - f"Database not found for {species_name}, check registry parameters" - ) + self._check_species_found(species, loaded_dbs) # Getting dbs for specified divisions - elif division and len(division) > 0: + elif division: for div in division: - metasearch_body = { - "name_pattern": name_pattern, - "filters": [ - {"meta_key": "schema_version", "meta_value": ensembl_release}, - {"meta_key": "species.division", "meta_value": div}, - ], - "servers": [registry], - } - - # Query registry for dbs in division - div_dbs = requests.post(metasearch_url, json=metasearch_body).json() - dbs = dbs 
+ div_dbs[registry] + div_dbs = self._query_registry(metasearch_url, name_pattern, ensembl_release, registry, div) + dbs.extend(div_dbs) loaded_dbs = self.check_validity(dbs, db_prefix, group, ensembl_release) - if len(loaded_dbs) == 0: - raise IOError(f"Could not find any matching dbs in registry {registry}") + # No species or division specified with run_all set to False + else: + raise ValueError("Must provide species or division with run_all set to False") + + if not loaded_dbs: + raise LookupError(f"Could not find any matching dbs in registry {registry}") if run_all: logging.info(f"All species in {len(loaded_dbs)} databases loaded") @@ -143,35 +94,59 @@ def run(self): # Write dataflow output for species_name, db in loaded_dbs.items(): if species_name not in antispecies: - self.write_output( - "species", {"species_name": species_name, "species_db": db} - ) + self.write_output("species", {"species_name": species_name, "species_db": db}) - def check_validity(self, dbs: List(str), prefix: str, group: str, release: str): + def _fix_registry_url(self, registry: str) -> str: + match = re.search(r"^(.*)://(.*)", registry) + if match: + registry = match.group(2) + match = re.search(r"(.*)/(.*)$", registry) + if match: + registry = match.group(1) + return registry + + def _query_registry(self, metasearch_url: str, name_pattern: str, ensembl_release: int, registry: str, division: str = None) -> List[str]: + ensembl_release_str = str(ensembl_release) + + filters = [ + { + "meta_key": "schema_version", + "meta_value": ensembl_release_str + } + ] + + if division: + filters.append({"meta_key": "species.division", "meta_value": division}) + + metasearch_body = { + "name_pattern": name_pattern, + "filters": filters, + "servers": [registry], + } + response = requests.post(metasearch_url, json=metasearch_body).json() + return response.get(registry, []) + + def _check_species_found(self, species_list: List[str], loaded_dbs: Dict[str, str]): + for species_name in species_list: + if species_name not in loaded_dbs: + raise LookupError(f"Database not found for {species_name}, check registry parameters") + + def check_validity(self, dbs: List[str], prefix: str, group: str, release: int) -> Dict[str, str]: valid_dbs = {} for db in dbs: # Extract db name - db_name = db - match = re.search(r"(.*)/(.*)", db_name) - if match: - db_name = match.group(2) + db_name = re.search(r"(.*)/(.*)$", db).group(2) if re.search(r"(.*)/(.*)$", db) else db # Check if db is valid - match = re.search( - r"^(%s)([a-z]+_[a-z0-9]+(?:_[a-z0-9]+)?)_%s(?:_\d+)?_%s_(\w+)$" - % (prefix, group, release), - db_name, - ) + match = re.search(rf"^{prefix}([a-z]+_[a-z0-9]+(?:_[a-z0-9]+)?)_{group}(?:_\d+)?_{release}_(\w+)$", db_name) if match: - species_name = match.group(2) - if not valid_dbs.get(species_name): + species_name = match.group(1) + if species_name not in valid_dbs: logging.info(f"Species {species_name} loaded") valid_dbs[species_name] = db else: - raise IOError( - f"Database {valid_dbs[species_name]} already loaded for species {species_name}, cannot load second database {db}" - ) + raise ValueError(f"Database {valid_dbs[species_name]} already loaded for species {species_name}, cannot load second database {db}") else: logging.info(f"Could not extract species name from database {db}") diff --git a/src/python/ensembl/production/xrefs/UniParcMapping.py b/src/python/ensembl/production/xrefs/UniParcMapping.py index 86668b621..07beaa923 100644 --- a/src/python/ensembl/production/xrefs/UniParcMapping.py +++ 
b/src/python/ensembl/production/xrefs/UniParcMapping.py @@ -14,27 +14,26 @@ """Xref module to process the Uniparc mappings.""" -from ensembl.production.xrefs.Base import * +import logging + +from ensembl.production.xrefs.Base import Base from ensembl.production.xrefs.mappers.UniParcMapper import UniParcMapper from ensembl.production.xrefs.mappers.methods.MySQLChecksum import MySQLChecksum - class UniParcMapping(Base): def run(self): - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - source_db_url = self.param_required("source_db_url", {"type": "str"}) - registry = self.param("registry_url", None, {"type": "str"}) - core_db_url = self.param("species_db", None, {"type": "str"}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) + registry: str = self.get_param("registry_url", {"type": str}) + core_db_url: str = self.get_param("species_db", {"type": str}) logging.info(f"UniParcMapping starting for species '{species_name}'") if not core_db_url: - core_db_url = self.get_db_from_registry( - species_name, "core", release, registry - ) + core_db_url = self.get_db_from_registry(species_name, "core", release, registry) # Get species id db_engine = self.get_db_engine(core_db_url) @@ -43,15 +42,13 @@ def run(self): # Get the uniparc mapper mapper = UniParcMapper( - self.get_xref_mapper( - xref_db_url, species_name, base_path, release, core_db_url, registry - ) + self.get_xref_mapper(xref_db_url, species_name, base_path, release, core_db_url, registry) ) # Get source id db_engine = self.get_db_engine(source_db_url) with db_engine.connect() as source_dbi: - source_id = self.get_source_id_from_name(source_dbi, "UniParc") + source_id = self.get_source_id_from_name("UniParc", source_dbi) method = MySQLChecksum({"MAPPER": mapper}) results = method.run( diff --git a/src/python/ensembl/production/xrefs/mappers/BasicMapper.py b/src/python/ensembl/production/xrefs/mappers/BasicMapper.py index 362eea354..bd563505d 100644 --- a/src/python/ensembl/production/xrefs/mappers/BasicMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/BasicMapper.py @@ -14,74 +14,27 @@ """Base module to handle xref mapping.""" -import re -import os -import sys -import warnings import logging -import subprocess -from sqlalchemy import select, insert, update, func, delete, desc, text +from sqlalchemy import select, insert, update, delete from sqlalchemy.engine import Engine, Connection -from sqlalchemy.orm import Session, sessionmaker, aliased -from sqlalchemy.sql.expression import case -from sqlalchemy.sql import Select -from typing import Dict, Any, List, Optional, Tuple - -from ensembl.core.models import ( - Gene as GeneORM, - Transcript as TranscriptORM, - Translation as TranslationORM, - Meta as MetaCORM, - AltAllele as AltAlleleCORM, - t_alt_allele_attrib as AltAlleleAttribORM, - ObjectXref as ObjectXrefCORM, - Xref as XrefCORM, - ExternalDb as ExternalDbORM, - UnmappedObject as UnmappedObjectORM, - UnmappedReason as UnmappedReasonORM, - Analysis as 
AnalysisORM, - OntologyXref as OntologyXrefORM, - ExternalSynonym as ExternalSynonymORM, - DependentXref as DependentXrefCORM, - IdentityXref as IdentityXrefCORM, - SeqRegionAttrib as SeqRegionAttribORM, - AttribType as AttribTypeORM, -) +from typing import Dict, Any, Optional +from datetime import datetime from ensembl.xrefs.xref_update_db_model import ( GeneTranscriptTranslation as GeneTranscriptTranslationORM, - GeneStableId as GeneStableIdORM, - TranscriptStableId as TranscriptStableIdORM, - TranslationStableId as TranslationStableIdORM, Meta as MetaUORM, ProcessStatus as ProcessStatusORM, ObjectXref as ObjectXrefUORM, - AltAllele as AltAlleleUORM, Source as SourceUORM, Xref as XrefUORM, IdentityXref as IdentityXrefUORM, - DependentXref as DependentXrefUORM, - GeneDirectXref as GeneDirectXrefORM, - TranscriptDirectXref as TranscriptDirectXrefORM, - TranslationDirectXref as TranslationDirectXrefORM, - Mapping as MappingORM, - MappingJobs as MappingJobsORM, - CoordinateXref as CoordinateXrefORM, - Synonym as SynonymORM, - Pairs as PairsORM, - PrimaryXref as PrimaryXrefORM, - DisplayXrefPriority as DisplayXrefPriorityORM, - GeneDescPriority as GeneDescPriorityORM, + DependentXref as DependentXrefUORM ) -from datetime import datetime - - class BasicMapper: - def __init__(self, args: Dict[str, Any] = None) -> None: - if args is None: - args = {} + def __init__(self, args: Optional[Dict[str, Any]] = None) -> None: + args = args or {} self._xref = args.get("xref") self._core = args.get("core") @@ -90,7 +43,7 @@ def __init__(self, args: Dict[str, Any] = None) -> None: self._log_file = args.get("log_file") self._species_dir = args.get("species_dir") - def xref(self, xref_db_engine: Engine = None) -> Engine: + def xref(self, xref_db_engine: Optional[Engine] = None) -> Engine: """Getter/Setter for the xref DB engine. Parameters @@ -100,14 +53,15 @@ def xref(self, xref_db_engine: Engine = None) -> Engine: Returns ------- - The xref DB engine. + Engine + The xref DB engine. """ if xref_db_engine: self._xref = xref_db_engine return self._xref - def core(self, core_db_engine: Engine = None) -> Engine: + def core(self, core_db_engine: Optional[Engine] = None) -> Engine: """Getter/Setter for the core DB engine. Parameters @@ -117,14 +71,15 @@ def core(self, core_db_engine: Engine = None) -> Engine: Returns ------- - The core DB engine. + Engine + The core DB engine. """ if core_db_engine: self._core = core_db_engine return self._core - def dna_file(self, dna_file: str = None) -> str: + def dna_file(self, dna_file: Optional[str] = None) -> str: """Getter/Setter for the dna file. Parameters @@ -134,14 +89,15 @@ def dna_file(self, dna_file: str = None) -> str: Returns ------- - The dna file path + Optional[str] + The dna file path """ - if dna_file: + if dna_file is not None: self._dna_file = dna_file return self._dna_file - def protein_file(self, protein_file: str = None) -> str: + def protein_file(self, protein_file: Optional[str] = None) -> str: """Getter/Setter for the protein file. Parameters @@ -151,14 +107,15 @@ def protein_file(self, protein_file: str = None) -> str: Returns ------- - The protein file path + Optional[str] + The protein file path """ - if protein_file: + if protein_file is not None: self._protein_file = protein_file return self._protein_file - def log_file(self, log_file: str = None) -> str: + def log_file(self, log_file: Optional[str] = None) -> str: """Getter/Setter for the log file. 
Parameters @@ -168,14 +125,15 @@ def log_file(self, log_file: str = None) -> str: Returns ------- - The log file path + Optional[str] + The log file path """ - if log_file: + if log_file is not None: self._log_file = log_file return self._log_file - def species_dir(self, species_dir: str = None) -> str: + def species_dir(self, species_dir: Optional[str] = None) -> str: """Getter/Setter for the species directory. Parameters @@ -185,14 +143,16 @@ def species_dir(self, species_dir: str = None) -> str: Returns ------- - The species directory + Optional[str] + The species directory """ - if species_dir: + if species_dir is not None: self._species_dir = species_dir return self._species_dir def official_name(self) -> None: + """Returns the official name.""" return None def add_meta_pair(self, meta_key: str, meta_value: str) -> None: @@ -205,7 +165,7 @@ def add_meta_pair(self, meta_key: str, meta_value: str) -> None: meta_value: str The value of the 'meta_value' column in the meta table """ - now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + now = datetime.now() with self.xref().connect() as dbi: dbi.execute( @@ -214,13 +174,18 @@ def add_meta_pair(self, meta_key: str, meta_value: str) -> None: ) ) - def get_meta_value(self, meta_key: str) -> str: + def get_meta_value(self, meta_key: str) -> Optional[str]: """Gets a value from the meta table based on key. Parameters ---------- meta_key: str The value of the 'meta_key' column in the meta table + + Returns + ------- + Optional[str] + The value of the 'meta_value' column if found, else None """ with self.xref().connect() as dbi: query = ( @@ -228,11 +193,9 @@ def get_meta_value(self, meta_key: str) -> str: .where(MetaUORM.meta_key == meta_key) .order_by(MetaUORM.meta_id.desc()) ) - value = dbi.execute(query).first() + result = dbi.execute(query).first() - if value: - value = value[0] - return value + return result[0] if result else None def update_process_status(self, status: str) -> None: """Adds a row to the process_status table. @@ -240,19 +203,23 @@ def update_process_status(self, status: str) -> None: Parameters ---------- status: str - The value of the 'status' column on the process_status table + The value of the 'status' column in the process_status table """ - now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + now = datetime.now() with self.xref().connect() as dbi: - dbi.execute(insert(ProcessStatusORM).values(status=status, date=now)) + dbi.execute( + insert(ProcessStatusORM).values(status=status, date=now) + ) def set_up_logging(self) -> None: + """Sets up logging for the mapper.""" log_file = self.log_file() + # Create handlers and set levels console_handler = logging.StreamHandler() - file_handler = logging.FileHandler(log_file, mode="a") console_handler.setLevel(logging.WARNING) + file_handler = logging.FileHandler(log_file, mode="a") file_handler.setLevel(logging.DEBUG) logging.basicConfig( @@ -263,9 +230,16 @@ def set_up_logging(self) -> None: ) def log_progress(self, message: str) -> None: + """Logs a message to the console and log file. 
+ + Parameters + ---------- + message: str + The message to log + """ logging.info(message) - def get_object_xref_id(self, ensembl_id: int, xref_id: int, ensembl_type: str, linkage_type: str, dbi: Connection, master_xref_id: int = None, status: str = None) -> int: + def get_object_xref_id(self, ensembl_id: int, xref_id: int, ensembl_type: str, linkage_type: str, dbi: Connection, master_xref_id: Optional[int] = None, status: Optional[str] = None) -> Optional[int]: """Retrieves the object_xref row ID from ensembl ID, xref ID, ensembl type, and linkage type. Parameters @@ -282,15 +256,14 @@ def get_object_xref_id(self, ensembl_id: int, xref_id: int, ensembl_type: str, l The xref ID of the xref that this object xref is dependent on status: str, optional The object xref status - dbi: sqlalchemy.engine.Connection + dbi: Connection The database connection to query in Returns ------- - The object xref ID, if found (else None). + Optional[int] + The object xref ID, if found (else None). """ - object_xref_id = None - query = select(ObjectXrefUORM.object_xref_id).where( ObjectXrefUORM.ensembl_id == ensembl_id, ObjectXrefUORM.xref_id == xref_id, @@ -302,14 +275,11 @@ def get_object_xref_id(self, ensembl_id: int, xref_id: int, ensembl_type: str, l if status is not None: query = query.where(ObjectXrefUORM.ox_status == status) - result = dbi.execute(query).fetchall() - - if result: - object_xref_id = result[0][0] + result = dbi.execute(query).scalar() - return object_xref_id + return result - def add_object_xref(self, ensembl_id: int, xref_id: int, ensembl_type: str, linkage_type: str, dbi: Connection, master_xref_id: int = None, status: str = None) -> int: + def add_object_xref(self, ensembl_id: int, xref_id: int, ensembl_type: str, linkage_type: str, dbi: Connection, master_xref_id: Optional[int] = None, status: Optional[str] = None) -> int: """Adds data into object xref table in a database. Parameters @@ -326,21 +296,21 @@ def add_object_xref(self, ensembl_id: int, xref_id: int, ensembl_type: str, link The xref ID of the xref that this object xref is dependent on status: str, optional The object xref status - dbi: sqlalchemy.engine.Connection + dbi: Connection The database connection to query in Returns ------- - The inserted object xref ID. + int + The inserted object xref ID. """ query = insert(ObjectXrefUORM).values( ensembl_id=ensembl_id, xref_id=xref_id, ensembl_object_type=ensembl_type, linkage_type=linkage_type, + master_xref_id=master_xref_id ) - if master_xref_id is not None: - query = query.values(master_xref_id=master_xref_id) if status is not None: query = query.values(ox_status=status) dbi.execute(query) @@ -351,30 +321,38 @@ def add_object_xref(self, ensembl_id: int, xref_id: int, ensembl_type: str, link return object_xref_id def biomart_fix(self, db_name: str, type1: str, type2: str, dbi: Connection) -> None: + """Fixes the biomart issue where a database is associated with both gene and transcript/translation object types. + + Parameters + ---------- + db_name: str + The database name + type1: str + The first object type (gene, transcript, or translation) + type2: str + The second object type (gene, transcript, or translation) + dbi: Connection + The database connection to query in + """ logging.info( f"{db_name} is associated with both {type1} and {type2} object types. Fixing." 
) - # Figure out where to move xref to - to_type, from_type, to_id, from_id = None, None, None, None - if type1 == "Gene" or type2 == "Gene": + # Determine the types to move from and to + if "Gene" in (type1, type2): to_type = "Gene" - - if type1 == "Translation" or type2 == "Translation": - from_type = "Translation" - else: - from_type = "Transcript" + from_type = "Translation" if "Translation" in (type1, type2) else "Transcript" else: to_type = "Transcript" from_type = "Translation" logging.info(f"Moving all associations from {from_type} to {to_type}") - to_id = getattr(GeneTranscriptTranslationORM, to_type.lower() + "_id") - from_id = getattr(GeneTranscriptTranslationORM, from_type.lower() + "_id") + to_id = getattr(GeneTranscriptTranslationORM, f"{to_type.lower()}_id") + from_id = getattr(GeneTranscriptTranslationORM, f"{from_type.lower()}_id") # Move the object xref - query = ( + move_query = ( update(ObjectXrefUORM) .values(ensembl_object_type=to_type, ensembl_id=to_id) .where( @@ -387,10 +365,10 @@ def biomart_fix(self, db_name: str, type1: str, type2: str, dbi: Connection) -> ) .prefix_with("IGNORE") ) - dbi.execute(query) + dbi.execute(move_query) # Delete moved object xref - query = ( + delete_query = ( select(ObjectXrefUORM.object_xref_id) .outerjoin( IdentityXrefUORM, @@ -404,7 +382,7 @@ def biomart_fix(self, db_name: str, type1: str, type2: str, dbi: Connection) -> SourceUORM.name == db_name, ) ) - for row in dbi.execute(query).mappings().all(): + for row in dbi.execute(delete_query).mappings().all(): dbi.execute( delete(ObjectXrefUORM).where( ObjectXrefUORM.object_xref_id == row.object_xref_id @@ -418,12 +396,23 @@ def biomart_fix(self, db_name: str, type1: str, type2: str, dbi: Connection) -> # Delete dependent xref sub_query = select(ObjectXrefUORM.object_xref_id) - query = delete(DependentXrefUORM).where( + dependent_delete_query = delete(DependentXrefUORM).where( DependentXrefUORM.object_xref_id.not_in(sub_query) ) - dbi.execute(query) + dbi.execute(dependent_delete_query) def update_object_xref_status(self, object_xref_id: int, status: str, dbi: Connection) -> None: + """Updates the status of an object xref. 
+ + Parameters + ---------- + object_xref_id: int + The object xref ID to update + status: str + The new status + dbi: Connection + The database connection to query in + """ query = ( update(ObjectXrefUORM) .where(ObjectXrefUORM.object_xref_id == object_xref_id) diff --git a/src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py b/src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py index 535bb7ad6..08c194dc7 100644 --- a/src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py @@ -14,8 +14,19 @@ """Mapper module for processing Checksum xref data.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +import logging +from typing import Any, Dict, List, Optional +from sqlalchemy.orm import sessionmaker +from sqlalchemy import delete, select +from sqlalchemy.engine import Connection +from ensembl.xrefs.xref_update_db_model import ( + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class ChecksumMapper(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -27,10 +38,9 @@ def __init__(self, mapper: BasicMapper) -> None: def target(self) -> None: return None - def mapper(self, mapper: BasicMapper = None): + def mapper(self, mapper: Optional[BasicMapper] = None) -> BasicMapper: if mapper: self._mapper = mapper - return self._mapper def upload(self, results: List[Dict[str, Any]], species_id: int) -> None: @@ -46,7 +56,7 @@ def upload(self, results: List[Dict[str, Any]], species_id: int) -> None: self._delete_entries("xref", source_id, xref_dbi) # Start session, in order to get inserted IDs - Session = sessionmaker(self.xref()) + Session = sessionmaker(bind=self.xref().execution_options(isolation_level="READ COMMITTED")) with Session.begin() as session: logging.info("Starting xref insertion") @@ -54,7 +64,7 @@ def upload(self, results: List[Dict[str, Any]], species_id: int) -> None: upi_xref_id = {} for row in results: upi = row["upi"] - if upi_xref_id.get(upi): + if upi in upi_xref_id: row["xref_id"] = upi_xref_id[upi] else: xref_object = XrefUORM( @@ -81,6 +91,8 @@ def upload(self, results: List[Dict[str, Any]], species_id: int) -> None: ) session.add(object_xref_object) + session.commit() + logging.info("Finished insertions") def source_id(self) -> int: diff --git a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py index d938d966c..6bf44f8bc 100644 --- a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py @@ -14,14 +14,25 @@ """Mapper module for processing coordinate xref data.""" -from ensembl.production.xrefs.mappers.BasicMapper import * -from ensembl.common.RangeRegistry import RangeRegistry +import subprocess +import logging +from datetime import datetime +from sqlalchemy import select, func, update, insert + +from ensembl.core.models import ( + ObjectXref as ObjectXrefCORM, + Xref as XrefCORM, + UnmappedObject as UnmappedObjectORM, + UnmappedReason as UnmappedReasonORM, + Analysis as AnalysisORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper coding_weight = 2 ens_weight = 3 transcript_score_threshold = 0.75 - class CoordinateMapper(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: self.xref(mapper.xref()) @@ -33,98 +44,89 @@ def run_coordinatemapping(self, species_name: str, 
species_id: int, scripts_dir: self.update_process_status("coordinate_xrefs_started") # We only do coordinate mapping for mouse and human for now - if species_name != "mus_musculus" and species_name != "homo_sapiens": + if species_name not in ["mus_musculus", "homo_sapiens"]: self.update_process_status("coordinate_xref_finished") return output_dir = self.species_dir() - xref_filename = os.path.join(output_dir, "xref_coord.txt") - object_xref_filename = os.path.join(output_dir, "object_xref_coord.txt") - unmapped_reason_filename = os.path.join(output_dir, "unmapped_reason_coord.txt") - unmapped_object_filename = os.path.join(output_dir, "unmapped_object_coord.txt") - - xref_dbi = self.xref().connect() - core_dbi = self.core().connect() - - # Figure out the last used IDs in the core DB - xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() - object_xref_id = core_dbi.execute( - select(func.max(ObjectXrefCORM.object_xref_id)) - ).scalar() - unmapped_object_id = core_dbi.execute( - select(func.max(UnmappedObjectORM.unmapped_object_id)) - ).scalar() - unmapped_reason_id = core_dbi.execute( - select(func.max(UnmappedReasonORM.unmapped_reason_id)) - ).scalar() - - logging.info( - f"Last used xref_id={xref_id}, object_xref_id={object_xref_id}, unmapped_object_id={unmapped_object_id}, unmapped_reason_id={unmapped_reason_id}" - ) - - # Get an analysis ID - analysis_params = f"weights(coding,ensembl)={coding_weight:.2f},{ens_weight:.2f};transcript_score_threshold={transcript_score_threshold:.2f}" - analysis_id = core_dbi.execute( - select(AnalysisORM.analysis_id).where( - AnalysisORM.logic_name == "xrefcoordinatemapping", - AnalysisORM.parameters == analysis_params, + + with self.xref().connect() as xref_dbi, self.core().connect() as core_dbi: + # Figure out the last used IDs in the core DB + xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() + object_xref_id = core_dbi.execute(select(func.max(ObjectXrefCORM.object_xref_id))).scalar() + unmapped_object_id = core_dbi.execute(select(func.max(UnmappedObjectORM.unmapped_object_id))).scalar() + unmapped_reason_id = core_dbi.execute(select(func.max(UnmappedReasonORM.unmapped_reason_id))).scalar() + + logging.info( + f"Last used xref_id={xref_id}, object_xref_id={object_xref_id}, unmapped_object_id={unmapped_object_id}, unmapped_reason_id={unmapped_reason_id}" ) - ).scalar() - if not analysis_id: + # Get an analysis ID + analysis_params = f"weights(coding,ensembl)={coding_weight:.2f},{ens_weight:.2f};transcript_score_threshold={transcript_score_threshold:.2f}" analysis_id = core_dbi.execute( select(AnalysisORM.analysis_id).where( - AnalysisORM.logic_name == "xrefcoordinatemapping" + AnalysisORM.logic_name == "xrefcoordinatemapping", + AnalysisORM.parameters == analysis_params, ) ).scalar() - if analysis_id: - logging.info("Will update 'analysis' table with new parameter settings") - - # Update an existing analysis - now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - core_dbi.execute( - update(AnalysisORM) - .where(AnalysisORM.analysis_id == analysis_id) - .values(created=now, parameters=analysis_params) - ) - else: - logging.info( - f"Cannot find analysis ID for this analysis: logic_name = 'xrefcoordinatemapping' parameters = {analysis_params}" - ) - - # Store a new analysis - logging.info("A new analysis will be added") - + if not analysis_id: analysis_id = core_dbi.execute( - select(func.max(AnalysisORM.analysis_id)) - ).scalar() - logging.info(f"Last used analysis_id is {analysis_id}") - - analysis_id += 1 - now = 
datetime.now().strftime("%Y-%m-%d %H:%M:%S") - core_dbi.execute( - insert(AnalysisORM).values( - analysis_id=analysis_id, - created=now, - logic_name="xrefcoordinatemapping", - program="CoordinateMapper.pm", - parameters=analysis_params, - module="CoordinateMapper.pm", + select(AnalysisORM.analysis_id).where( + AnalysisORM.logic_name == "xrefcoordinatemapping" ) - ) + ).scalar() - if analysis_id: - logging.info(f"Analysis ID is {analysis_id}") + if analysis_id: + logging.info("Will update 'analysis' table with new parameter settings") - logging.info(f"Running perl script {scripts_dir}/coordinmate_mapper.pl") - perl_cmd = f"perl {scripts_dir}/coordinmate_mapper.pl --xref_db_url '{self.xref()}' --core_db_url '{self.core()}' --species_id {species_id} --output_dir '{output_dir}' --analysis_id {analysis_id}" - cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE) + # Update an existing analysis + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + core_dbi.execute( + update(AnalysisORM) + .where(AnalysisORM.analysis_id == analysis_id) + .values(created=now, parameters=analysis_params) + ) + else: + logging.info( + f"Cannot find analysis ID for this analysis: logic_name = 'xrefcoordinatemapping' parameters = {analysis_params}" + ) - self.update_process_status("coordinate_xref_finished") + # Store a new analysis + logging.info("A new analysis will be added") + + analysis_id = core_dbi.execute(select(func.max(AnalysisORM.analysis_id))).scalar() + logging.info(f"Last used analysis_id is {analysis_id}") + + analysis_id += 1 + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + core_dbi.execute( + insert(AnalysisORM).values( + analysis_id=analysis_id, + created=now, + logic_name="xrefcoordinatemapping", + program="CoordinateMapper.pm", + parameters=analysis_params, + module="CoordinateMapper.pm", + ) + ) + + if analysis_id: + logging.info(f"Analysis ID is {analysis_id}") + + logging.info(f"Running perl script {scripts_dir}/coordinate_mapper.pl") + perl_cmd = [ + "perl", + f"{scripts_dir}/coordinate_mapper.pl", + "--xref_db_url", str(self.xref()), + "--core_db_url", str(self.core()), + "--species_id", str(species_id), + "--output_dir", output_dir, + "--analysis_id", str(analysis_id) + ] + subprocess.run(perl_cmd, capture_output=True, text=True, check=True) - self.biomart_fix("UCSC", "Translation", "Gene", xref_dbi) - self.biomart_fix("UCSC", "Transcript", "Gene", xref_dbi) + self.update_process_status("coordinate_xref_finished") - xref_dbi.close() - core_dbi.close() + self.biomart_fix("UCSC", "Translation", "Gene", xref_dbi) + self.biomart_fix("UCSC", "Transcript", "Gene", xref_dbi) diff --git a/src/python/ensembl/production/xrefs/mappers/CoreInfo.py b/src/python/ensembl/production/xrefs/mappers/CoreInfo.py index eff41f4a2..7ea033aae 100644 --- a/src/python/ensembl/production/xrefs/mappers/CoreInfo.py +++ b/src/python/ensembl/production/xrefs/mappers/CoreInfo.py @@ -14,8 +14,34 @@ """Mapper module for loading core data into an xref database.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +from typing import Any, Dict, List, Optional +from sqlalchemy import select, insert, delete +from sqlalchemy.engine import Connection + +from ensembl.core.models import ( + Gene as GeneORM, + Transcript as TranscriptORM, + Translation as TranslationORM, + Meta as MetaCORM, + AltAllele as AltAlleleCORM, + t_alt_allele_attrib as AltAlleleAttribORM, + ObjectXref as ObjectXrefCORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + SeqRegionAttrib as 
SeqRegionAttribORM, + AttribType as AttribTypeORM +) + +from ensembl.xrefs.xref_update_db_model import ( + GeneTranscriptTranslation as GeneTranscriptTranslationORM, + GeneStableId as GeneStableIdORM, + TranscriptStableId as TranscriptStableIdORM, + TranslationStableId as TranslationStableIdORM, + AltAllele as AltAlleleUORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class CoreInfo(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -33,214 +59,192 @@ def get_core_data(self) -> None: self.update_process_status("core_data_loaded") def load_gene_transcript_translation(self) -> None: - xref_dbi = self.xref().connect() - core_dbi = self.core().connect() - - query = select( - TranscriptORM.gene_id, - TranscriptORM.transcript_id, - TranslationORM.translation_id, - ).outerjoin( - TranslationORM, TranscriptORM.transcript_id == TranslationORM.transcript_id - ) - for row in core_dbi.execute(query).mappings().all(): - xref_dbi.execute( - insert(GeneTranscriptTranslationORM) - .values( - gene_id=row.gene_id, - transcript_id=row.transcript_id, - translation_id=row.translation_id, - ) - .prefix_with("IGNORE") + with self.xref().connect() as xref_dbi, self.core().connect() as core_dbi: + query = select( + TranscriptORM.gene_id, + TranscriptORM.transcript_id, + TranslationORM.translation_id, + ).outerjoin( + TranslationORM, TranscriptORM.transcript_id == TranslationORM.transcript_id ) - - xref_dbi.close() - core_dbi.close() + for row in core_dbi.execute(query).mappings().all(): + xref_dbi.execute( + insert(GeneTranscriptTranslationORM) + .values( + gene_id=row.gene_id, + transcript_id=row.transcript_id, + translation_id=row.translation_id, + ) + .prefix_with("IGNORE") + ) def load_stable_ids(self) -> None: - xref_dbi = self.xref().connect() - core_dbi = self.core().connect() - - core_tables = { - "gene": GeneORM, - "transcript": TranscriptORM, - "translation": TranslationORM, - } - xref_tables = { - "gene": GeneStableIdORM, - "transcript": TranscriptStableIdORM, - "translation": TranslationStableIdORM, - } - - for table in ["gene", "transcript", "translation"]: - column = getattr(core_tables[table], f"{table}_id") - core_query = select( - column.label("internal_id"), core_tables[table].stable_id - ) - if table == "transcript": - core_query = core_query.add_columns(TranscriptORM.biotype) - - count = 0 - for row in core_dbi.execute(core_query).mappings().all(): - xref_query = ( - insert(xref_tables[table]) - .values(internal_id=row.internal_id, stable_id=row.stable_id) - .prefix_with("IGNORE") + with self.xref().connect() as xref_dbi, self.core().connect() as core_dbi: + core_tables = { + "gene": GeneORM, + "transcript": TranscriptORM, + "translation": TranslationORM, + } + xref_tables = { + "gene": GeneStableIdORM, + "transcript": TranscriptStableIdORM, + "translation": TranslationStableIdORM, + } + + for table in ["gene", "transcript", "translation"]: + column = getattr(core_tables[table], f"{table}_id") + core_query = select( + column.label("internal_id"), core_tables[table].stable_id ) if table == "transcript": - xref_query = xref_query.values(biotype=row.biotype) - xref_dbi.execute(xref_query) - - count += 1 + core_query = core_query.add_columns(TranscriptORM.biotype) + + count = 0 + for row in core_dbi.execute(core_query).mappings().all(): + xref_query = ( + insert(xref_tables[table]) + .values(internal_id=row.internal_id, stable_id=row.stable_id) + .prefix_with("IGNORE") + ) + if table == "transcript": + xref_query = xref_query.values(biotype=row.biotype) 
+ xref_dbi.execute(xref_query) - logging.info(f"{count} {table}s loaded from core DB") + count += 1 - xref_dbi.close() - core_dbi.close() + logging.info(f"{count} {table}s loaded from core DB") def get_alt_alleles(self) -> None: - xref_dbi = self.xref().connect() - core_dbi = self.core().connect() - - alt_allele_list = self.fetch_all_alt_alleles(core_dbi) - - count = len(alt_allele_list) - alt_id_to_gene_id, gene_id_to_alt_id, is_reference = {}, {}, {} - max_alt_id = 0 - - if count > 0: - xref_dbi.execute(delete(AltAlleleUORM)) - - alt_added, num_of_genes = 0, 0 - - # Iterate through all alt-allele groups, pushing unique alleles into the xref alt allele table - # Track the reference gene IDs - for group_id, group_members in alt_allele_list.items(): - ref_gene = self.rep_gene_id(group_members) - - # Representative gene not guaranteed, try to find an alternative best fit - if not ref_gene: - logging.info("Get alternative reference gene") - for gene_id in self.get_all_genes(group_members): - query = select(AttribTypeORM.code).where( - SeqRegionAttribORM.seq_region_id == GeneORM.seq_region_id, - AttribTypeORM.attrib_type_id - == SeqRegionAttribORM.attrib_type_id, - GeneORM.gene_id == gene_id, - AttribTypeORM.code == "non_ref", + with self.xref().connect() as xref_dbi, self.core().connect() as core_dbi: + alt_allele_list = self.fetch_all_alt_alleles(core_dbi) + + count = len(alt_allele_list) + max_alt_id = 0 + + if count > 0: + xref_dbi.execute(delete(AltAlleleUORM)) + + alt_added, num_of_genes = 0, 0 + + # Iterate through all alt-allele groups, pushing unique alleles into the xref alt allele table + # Track the reference gene IDs + for group_id, group_members in alt_allele_list.items(): + ref_gene = self.rep_gene_id(group_members) + + # Representative gene not guaranteed, try to find an alternative best fit + if not ref_gene: + logging.info("Get alternative reference gene") + for gene_id in self.get_all_genes(group_members): + query = select(AttribTypeORM.code).where( + SeqRegionAttribORM.seq_region_id == GeneORM.seq_region_id, + AttribTypeORM.attrib_type_id + == SeqRegionAttribORM.attrib_type_id, + GeneORM.gene_id == gene_id, + AttribTypeORM.code == "non_ref", + ) + result = core_dbi.execute(query) + if result.rowcount > 0: + continue + else: + ref_gene = gene_id + break + + if not ref_gene: + logging.warning( + f"Tried very hard but failed to select a representative gene for alt-allele-group {group_id}" ) - result = core_dbi.execute(query) - if result.rowcount > 0: - continue - else: - ref_gene = gene_id - break - - if not ref_gene: - logging.warning( - f"Tried very hard but failed to select a representative gene for alt-allele-group {group_id}" - ) - continue + continue - is_reference[ref_gene] = 1 - others = [] - for member in group_members: - if member[0] != ref_gene: - others.append(member[0]) + others = [member[0] for member in group_members if member[0] != ref_gene] - xref_dbi.execute( - insert(AltAlleleUORM).values( - alt_allele_id=group_id, gene_id=ref_gene, is_reference=1 - ) - ) - num_of_genes += 1 - alt_added += 1 - for gene_id in others: xref_dbi.execute( insert(AltAlleleUORM).values( - alt_allele_id=group_id, gene_id=gene_id, is_reference=0 + alt_allele_id=group_id, gene_id=ref_gene, is_reference=1 ) ) num_of_genes += 1 + alt_added += 1 + for gene_id in others: + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=group_id, gene_id=gene_id, is_reference=0 + ) + ) + num_of_genes += 1 - if group_id > max_alt_id: - max_alt_id = group_id - - logging.info(f"{alt_added} 
alleles found containing {num_of_genes} genes") - else: - logging.info("No alt alleles found for this species") + if group_id > max_alt_id: + max_alt_id = group_id - # LRGs added as alt_alleles in the XREF system but never added to core - count = 0 - old_count, new_count, lrg_count = 0, 0, 0 + logging.info(f"{alt_added} alleles found containing {num_of_genes} genes") + else: + logging.info("No alt alleles found for this species") - query = ( - select(ObjectXrefCORM.ensembl_id, GeneORM.gene_id) - .where( - XrefCORM.xref_id == ObjectXrefCORM.xref_id, - ExternalDbORM.external_db_id == XrefCORM.external_db_id, - ObjectXrefCORM.ensembl_object_type == "Gene", - XrefCORM.display_label == GeneORM.stable_id, - ) - .filter(ExternalDbORM.db_name.like("Ens_Hs_gene")) - ) - for row in core_dbi.execute(query).mappings().all(): - # If the core gene is already in an alt_allele set then use that alt_id for the LRG gene only - # Else use a new one and add both core and LRG - group_id = self.fetch_group_id_by_gene_id(row.gene_id, core_dbi) - if group_id: - xref_dbi.execute( - insert(AltAlleleUORM).values( - alt_allele_id=group_id, gene_id=row.ensembl_id, is_reference=0 - ) + # LRGs added as alt_alleles in the XREF system but never added to core + count = 0 + old_count, new_count, lrg_count = 0, 0, 0 + + query = ( + select(ObjectXrefCORM.ensembl_id, GeneORM.gene_id) + .where( + XrefCORM.xref_id == ObjectXrefCORM.xref_id, + ExternalDbORM.external_db_id == XrefCORM.external_db_id, + ObjectXrefCORM.ensembl_object_type == "Gene", + XrefCORM.display_label == GeneORM.stable_id, ) - old_count += 1 - else: - group_id = self.fetch_group_id_by_gene_id(row.ensembl_id, core_dbi) + .filter(ExternalDbORM.db_name.like("Ens_Hs_gene")) + ) + for row in core_dbi.execute(query).mappings().all(): + # If the core gene is already in an alt_allele set then use that alt_id for the LRG gene only + # Else use a new one and add both core and LRG + group_id = self.fetch_group_id_by_gene_id(row.gene_id, core_dbi) if group_id: xref_dbi.execute( insert(AltAlleleUORM).values( - alt_allele_id=group_id, - gene_id=row.ensembl_id, - is_reference=1, + alt_allele_id=group_id, gene_id=row.ensembl_id, is_reference=0 ) ) - lrg_count += 1 - logging.info(f"LRG peculiarity\t{row.gene_id}\t{row.ensembl_id}") + old_count += 1 else: - max_alt_id += 1 - xref_dbi.execute( - insert(AltAlleleUORM).values( - alt_allele_id=max_alt_id, - gene_id=row.ensembl_id, - is_reference=0, + group_id = self.fetch_group_id_by_gene_id(row.ensembl_id, core_dbi) + if group_id: + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=group_id, + gene_id=row.ensembl_id, + is_reference=1, + ) ) - ) - xref_dbi.execute( - insert(AltAlleleUORM).values( - alt_allele_id=max_alt_id, - gene_id=row.gene_id, - is_reference=1, + lrg_count += 1 + logging.info(f"LRG peculiarity\t{row.gene_id}\t{row.ensembl_id}") + else: + max_alt_id += 1 + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=max_alt_id, + gene_id=row.ensembl_id, + is_reference=0, + ) ) - ) - new_count += 1 - count += 1 - - if count: - logging.info( - f"Added {count} alt_alleles for the LRGs. {old_count} added to previous alt_alleles and {new_count} new ones" - ) - logging.info(f"LRG problem count = {lrg_count}") + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=max_alt_id, + gene_id=row.gene_id, + is_reference=1, + ) + ) + new_count += 1 + count += 1 - xref_dbi.close() - core_dbi.close() + if count: + logging.info( + f"Added {count} alt_alleles for the LRGs. 
{old_count} added to previous alt_alleles and {new_count} new ones" + ) + logging.info(f"LRG problem count = {lrg_count}") - self.update_process_status("alt_alleles_added") + self.update_process_status("alt_alleles_added") def fetch_all_alt_alleles(self, dbi: Connection) -> Dict[int, List[List[Any]]]: group_list = {} - query = None - if self.is_multispecies(dbi): ##### TO DO: handle multiespecies raise NotImplementedError(f"Pipeline cannot handle multispecies DBs yet") @@ -281,7 +285,7 @@ def fetch_group_id_by_gene_id(self, gene_id: int, dbi: Connection) -> Optional[i ) group_list = dbi.execute(query).mappings().all() - if len(group_list) > 0: + if group_list: return group_list[0].alt_allele_group_id return None @@ -293,10 +297,7 @@ def is_multispecies(self, dbi: Connection) -> bool: ) ) - if result.rowcount > 1: - return True - else: - return False + return result.rowcount > 1 def rep_gene_id(self, group: List[List[Any]]) -> Optional[int]: for allele in group: @@ -312,9 +313,4 @@ def rep_gene_id(self, group: List[List[Any]]) -> Optional[int]: return None def get_all_genes(self, group: List[List[Any]]) -> List[int]: - gene_ids = [] - - for allele in group: - gene_ids.append(allele[0]) - - return sorted(gene_ids) + return sorted(allele[0] for allele in group) diff --git a/src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py b/src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py index c3113dee3..a05028aca 100644 --- a/src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py @@ -14,8 +14,26 @@ """Mapper module for processing direct xref data.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +import re +from typing import Any, Dict +from sqlalchemy import select, insert +from sqlalchemy.engine import Connection + +from ensembl.xrefs.xref_update_db_model import ( + GeneStableId as GeneStableIdORM, + TranscriptStableId as TranscriptStableIdORM, + TranslationStableId as TranslationStableIdORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + GeneDirectXref as GeneDirectXrefORM, + TranscriptDirectXref as TranscriptDirectXrefORM, + TranslationDirectXref as TranslationDirectXrefORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class DirectXrefsMapper(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -41,9 +59,9 @@ def process(self) -> None: err_count = {} object_xref_id = 0 - for table in ["gene", "transcript", "translation"]: - direct_table = db_tables[table]["direct"] - stable_id_table = db_tables[table]["stable_id"] + for object_type, tables in db_tables.items(): + direct_table = tables["direct"] + stable_id_table = tables["stable_id"] count, duplicate_direct_count, duplicate_dependent_count = 0, 0, 0 @@ -89,14 +107,14 @@ def process(self) -> None: # Insert into object xref table object_xref_id = self.get_object_xref_id( - internal_id, xref_id, table, "DIRECT", xref_dbi + internal_id, xref_id, object_type, "DIRECT", xref_dbi ) if object_xref_id: duplicate_direct_count += 1 continue else: object_xref_id = self.add_object_xref( - internal_id, xref_id, table, "DIRECT", xref_dbi + internal_id, xref_id, object_type, "DIRECT", xref_dbi ) # Insert into identity xref table @@ -113,7 +131,7 @@ def process(self) -> None: { "master_xrefs": master_xref_ids, "dup_count": duplicate_dependent_count, - "table": table, + "table": object_type, "internal_id": 
internal_id, }, xref_dbi, @@ -121,7 +139,7 @@ def process(self) -> None: if duplicate_direct_count or duplicate_dependent_count: logging.info( - f"Duplicate entries ignored for {duplicate_direct_count} direct xrefs and {duplicate_dependent_count} dependent xrefs" + f"Duplicate entries ignored for {duplicate_direct_count} direct xrefs and {duplicate_dependent_count} dependent xrefs" ) for key, val in err_count.items(): diff --git a/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py index a2a543589..22b6f61b7 100644 --- a/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py +++ b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py @@ -14,8 +14,39 @@ """Mapper module for setting display xrefs in the core DB.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +import re +from datetime import datetime +from typing import Dict, List, Tuple +from sqlalchemy import select, insert, update, delete, case, desc, func, aliased +from sqlalchemy.engine import Connection +from sqlalchemy.sql import Select + +from ensembl.core.models import ( + Gene as GeneORM, + Transcript as TranscriptORM, + Translation as TranslationORM, + Meta as MetaCORM, + ObjectXref as ObjectXrefCORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + ExternalSynonym as ExternalSynonymORM +) + +from ensembl.xrefs.xref_update_db_model import ( + GeneTranscriptTranslation as GeneTranscriptTranslationORM, + GeneStableId as GeneStableIdORM, + TranscriptStableId as TranscriptStableIdORM, + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + DisplayXrefPriority as DisplayXrefPriorityORM, + GeneDescPriority as GeneDescPriorityORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class DisplayXrefs(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -36,16 +67,14 @@ def build_display_xrefs(self) -> None: mapper = self.mapper() # Set the display xrefs + set_transcript_display_xrefs = hasattr(mapper, "set_transcript_names") if hasattr(mapper, "set_display_xrefs"): mapper.set_display_xrefs() else: - set_transcript_display_xrefs = False - if hasattr(mapper, "set_transcript_names"): - set_transcript_display_xrefs = True self.set_display_xrefs(set_transcript_display_xrefs) # Set transcript names - if hasattr(mapper, "set_transcript_names"): + if set_transcript_display_xrefs: mapper.set_transcript_names() else: self.set_transcript_names() @@ -64,47 +93,33 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: logging.info("Setting Transcript and Gene display xrefs") # Get the xref offset used when adding the xrefs into the core DB - xref_offset = self.get_meta_value("xref_offset") - xref_offset = int(xref_offset) + xref_offset = int(self.get_meta_value("xref_offset")) logging.info(f"Using xref offset of {xref_offset}") xref_dbi = self.xref().connect() core_dbi = self.core().connect() mapper = self.mapper() - # Reset transcript display xrefs + # Reset transcript display xrefs if required if set_transcript_display_xrefs: core_dbi.execute( update(TranscriptORM) .values(display_xref_id=None) - .where(TranslationORM.biotype != "LRG_gene") + .where(TranscriptORM.biotype != "LRG_gene") ) for object_type in ["Gene", "Transcript"]: if object_type == "Transcript" and not set_transcript_display_xrefs: continue - precedence_list, ignore = None, None # Get name source priorities and ignore 
queries method = f"{object_type.lower()}_display_xref_sources" - if hasattr(mapper, method): - precedence_list, ignore = getattr(mapper, method)() - else: - precedence_list, ignore = getattr(self, method)() + precedence_list, ignore = getattr(mapper, method)() if hasattr(mapper, method) else getattr(self, method)() # Add the priorities into the DB - priority = 0 logging.info(f"Precedence for {object_type} display xrefs (1- best name)") - - for source_name in precedence_list: - priority += 1 - - # Get the source ID - query = ( - select(SourceUORM.source_id, SourceUORM.name) - .where(SourceUORM.name.like(source_name)) - .order_by(SourceUORM.priority) - ) + for priority, source_name in enumerate(precedence_list, start=1): + query = select(SourceUORM.source_id, SourceUORM.name).where(SourceUORM.name.like(source_name)) for row in xref_dbi.execute(query).mappings().all(): xref_dbi.execute( insert(DisplayXrefPriorityORM).values( @@ -113,8 +128,7 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: priority=priority, ) ) - - logging.info(f"{priority} - {row.name}") + logging.info(f"{priority} - {row.name}") # Execute ignore queries self._apply_ignore(ignore, xref_dbi) @@ -129,34 +143,19 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: gene_case_stmt = case( [ (ObjectXrefUORM.ensembl_object_type == "Gene", GTTGene.gene_id), - ( - ObjectXrefUORM.ensembl_object_type == "Transcript", - GTTTranscript.gene_id, - ), - ( - ObjectXrefUORM.ensembl_object_type == "Translation", - GTTTranslation.gene_id, - ), + (ObjectXrefUORM.ensembl_object_type == "Transcript", GTTTranscript.gene_id), + (ObjectXrefUORM.ensembl_object_type == "Translation", GTTTranslation.gene_id), ], ).label("d_gene_id") transcript_case_stmt = case( [ - ( - ObjectXrefUORM.ensembl_object_type == "Gene", - GTTGene.transcript_id, - ), - ( - ObjectXrefUORM.ensembl_object_type == "Transcript", - GTTTranscript.transcript_id, - ), - ( - ObjectXrefUORM.ensembl_object_type == "Translation", - GTTTranslation.transcript_id, - ), + (ObjectXrefUORM.ensembl_object_type == "Gene", GTTGene.transcript_id), + (ObjectXrefUORM.ensembl_object_type == "Transcript", GTTTranscript.transcript_id), + (ObjectXrefUORM.ensembl_object_type == "Translation", GTTTranslation.transcript_id), ], ).label("d_transcript_id") - # Get all relevent xrefs for this object type based on precendence sources + # Get all relevant xrefs for this object type based on precedence sources query = ( select( gene_case_stmt, @@ -164,24 +163,13 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: DisplayXrefPriorityORM.priority, XrefUORM.xref_id, ) - .join( - SourceUORM, SourceUORM.source_id == DisplayXrefPriorityORM.source_id - ) + .join(SourceUORM, SourceUORM.source_id == DisplayXrefPriorityORM.source_id) .join(XrefUORM, XrefUORM.source_id == SourceUORM.source_id) .join(ObjectXrefUORM, ObjectXrefUORM.xref_id == XrefUORM.xref_id) - .join( - IdentityXrefUORM, - IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, - ) + .join(IdentityXrefUORM, IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id) .outerjoin(GTTGene, GTTGene.gene_id == ObjectXrefUORM.ensembl_id) - .outerjoin( - GTTTranscript, - GTTTranscript.transcript_id == ObjectXrefUORM.ensembl_id, - ) - .outerjoin( - GTTTranslation, - GTTTranslation.translation_id == ObjectXrefUORM.ensembl_id, - ) + .outerjoin(GTTTranscript, GTTTranscript.transcript_id == ObjectXrefUORM.ensembl_id) + .outerjoin(GTTTranslation, GTTTranslation.translation_id == 
ObjectXrefUORM.ensembl_id) .where( ObjectXrefUORM.ox_status == "DUMP_OUT", DisplayXrefPriorityORM.ensembl_object_type == object_type, @@ -190,32 +178,22 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: "d_gene_id", ObjectXrefUORM.ensembl_object_type, DisplayXrefPriorityORM.priority, - desc( - IdentityXrefUORM.target_identity - + IdentityXrefUORM.query_identity - ), + desc(IdentityXrefUORM.target_identity + IdentityXrefUORM.query_identity), ObjectXrefUORM.unused_priority.desc(), XrefUORM.accession, ) ) for row in xref_dbi.execute(query).mappings().all(): - object_id = None - if object_type == "Gene": - object_id = row.d_gene_id - elif object_type == "Transcript": - object_id = row.d_transcript_id + object_id = row.d_gene_id if object_type == "Gene" else row.d_transcript_id # Update the display xrefs - if not object_seen.get(object_id): + if object_id not in object_seen: xref_id = int(row.xref_id) if object_type == "Gene": core_dbi.execute( update(GeneORM) .values(display_xref_id=xref_id + xref_offset) - .where( - GeneORM.gene_id == object_id, - GeneORM.display_xref_id == None, - ) + .where(GeneORM.gene_id == object_id, GeneORM.display_xref_id == None) ) elif object_type == "Transcript": core_dbi.execute( @@ -225,7 +203,7 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: ) display_xref_count += 1 - object_seen[object_id] = 1 + object_seen[object_id] = True logging.info(f"Updated {display_xref_count} {object_type} display_xrefs") @@ -242,8 +220,7 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: .outerjoin(GeneORM, GeneORM.display_xref_id == XrefCORM.xref_id) .where(GeneORM.display_xref_id == None) ) - result = core_dbi.execute(query).fetchall() - xref_ids = [row[0] for row in result] + xref_ids = [row[0] for row in core_dbi.execute(query).fetchall()] core_dbi.execute( delete(ExternalSynonymORM).where(ExternalSynonymORM.xref_id.in_(xref_ids)) @@ -286,6 +263,7 @@ def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: ) ignore_queries["EntrezGene"] = query + # Ignore LOC-prefixed labels query = ( select(ObjectXrefUORM.object_xref_id) .join(XrefUORM, XrefUORM.xref_id == ObjectXrefUORM.xref_id) @@ -304,22 +282,23 @@ def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]] def _apply_ignore(self, ignore_queries: Dict[str, Select], dbi: Connection) -> None: # Set status to NO_DISPLAY for object_xrefs with a display_label that is just numeric - query = ( + numeric_label_query = ( update(ObjectXrefUORM) .values(ox_status="NO_DISPLAY") .where( ObjectXrefUORM.xref_id == XrefUORM.xref_id, XrefUORM.source_id == SourceUORM.source_id, - ObjectXrefUORM.ox_status.like("DUMP_OUT"), + ObjectXrefUORM.ox_status == "DUMP_OUT", XrefUORM.label.regexp_match("^[0-9]+$"), ) ) - dbi.execute(query) + dbi.execute(numeric_label_query) # Go through ignore queries for ignore_type, ignore_query in ignore_queries.items(): # Set status to NO_DISPLAY for ignore results - for row in dbi.execute(ignore_query).mappings().all(): + ignore_results = dbi.execute(ignore_query).mappings().all() + for row in ignore_results: dbi.execute( update(ObjectXrefUORM) .values(ox_status="NO_DISPLAY") @@ -339,12 +318,8 @@ def set_transcript_names(self) -> None: ) # Get the max xref and object_xref IDs - xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() - xref_id = int(xref_id) - object_xref_id = core_dbi.execute( - select(func.max(ObjectXrefCORM.object_xref_id)) - ).scalar() - object_xref_id = 
int(object_xref_id) + xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() or 0 + object_xref_id = core_dbi.execute(select(func.max(ObjectXrefCORM.object_xref_id))).scalar() or 0 # Get all genes with set display_xref_id query = select( @@ -373,12 +348,12 @@ def set_transcript_names(self) -> None: ) # Get transcripts related to current gene - query = ( + transcript_query = ( select(TranscriptORM.transcript_id) .where(TranscriptORM.gene_id == row.gene_id) .order_by(TranscriptORM.seq_region_start, TranscriptORM.seq_region_end) ) - for transcript_row in core_dbi.execute(query).mappings().all(): + for transcript_row in core_dbi.execute(transcript_query).mappings().all(): object_xref_id += 1 display_label = f"{row.display_label}-{ext}" @@ -424,7 +399,7 @@ def set_transcript_names(self) -> None: ) ) - # Set transcript dispay xref + # Set transcript display xref core_dbi.execute( update(TranscriptORM) .values(display_xref_id=insert_xref_id) @@ -434,13 +409,12 @@ def set_transcript_names(self) -> None: ext += 1 # Delete object xrefs with no matching xref - query = ( + delete_query = ( select(ObjectXrefCORM.object_xref_id) .outerjoin(XrefCORM, XrefCORM.xref_id == ObjectXrefCORM.xref_id) .where(XrefCORM.xref_id == None) ) - result = core_dbi.execute(query).fetchall() - object_xref_ids = [row[0] for row in result] + object_xref_ids = [row[0] for row in core_dbi.execute(delete_query).fetchall()] core_dbi.execute( delete(ObjectXrefCORM).where( @@ -460,61 +434,41 @@ def set_gene_descriptions(self) -> None: # Reset the gene descriptions core_dbi.execute(update(GeneORM).values(description=None)) - # Get external display names - name_to_external_name = {} - query = select( - ExternalDbORM.external_db_id, - ExternalDbORM.db_name, - ExternalDbORM.db_display_name, - ) - for row in core_dbi.execute(query).mappings().all(): - name_to_external_name[row.db_name] = row.db_display_name - # Get source ID to external names mappings - if hasattr(mapper, "set_source_id_to_external_name"): - source_id_to_external_name, name_to_source_id = ( - mapper.set_source_id_to_external_name(name_to_external_name, xref_dbi) - ) - else: - source_id_to_external_name, name_to_source_id = ( - self.set_source_id_to_external_name(name_to_external_name, xref_dbi) - ) + source_id_to_external_name, name_to_source_id = self.get_external_name_mappings(core_dbi, xref_dbi) # Get description source priorities and ignore queries - if hasattr(mapper, "gene_description_sources"): - precedence_list = mapper.gene_description_sources() - ignore = None - else: - precedence_list, ignore = self.gene_description_sources() + precedence_list, ignore = ( + mapper.gene_description_sources() + if hasattr(mapper, "gene_description_sources") + else self.gene_description_sources() + ) # Get description regular expressions - if hasattr(mapper, "gene_description_filter_regexps"): - reg_exps = mapper.gene_description_filter_regexps() - else: - reg_exps = self.gene_description_filter_regexps() + reg_exps = ( + mapper.gene_description_filter_regexps() + if hasattr(mapper, "gene_description_filter_regexps") + else self.gene_description_filter_regexps() + ) # Add the description priorities into the DB - priority = 0 logging.info("Precedence for Gene descriptions (1- best description)") - - for source_name in precedence_list: - priority += 1 - - # Get the source ID - query = select(SourceUORM.source_id, SourceUORM.name).where( - SourceUORM.name.like(source_name) - ) - for row in xref_dbi.execute(query).mappings().all(): + for priority, source_name in 
enumerate(precedence_list, start=1): + for row in xref_dbi.execute( + select(SourceUORM.source_id, SourceUORM.name).where( + SourceUORM.name.like(source_name) + ) + ).mappings().all(): xref_dbi.execute( insert(GeneDescPriorityORM) .values(source_id=row.source_id, priority=priority) .prefix_with("IGNORE") ) - - logging.info(f"{priority} - {row.name}") + logging.info(f"{priority} - {row.name}") # Execute ignore queries - self._apply_ignore(ignore, xref_dbi) + if ignore: + self._apply_ignore(ignore, xref_dbi) no_source_name_in_desc = {} if hasattr(mapper, "no_source_label_list"): @@ -522,9 +476,9 @@ def set_gene_descriptions(self) -> None: source_id = name_to_source_id.get(source_name) if source_id: logging.info( - f"Source '{name}' will not have [Source:...] info in description" + f"Source '{source_name}' will not have [Source:...] info in description" ) - no_source_name_in_desc[source_id] = 1 + no_source_name_in_desc[source_id] = True gene_desc_updated = {} @@ -535,18 +489,12 @@ def set_gene_descriptions(self) -> None: gene_case_stmt = case( [ (ObjectXrefUORM.ensembl_object_type == "Gene", GTTGene.gene_id), - ( - ObjectXrefUORM.ensembl_object_type == "Transcript", - GTTTranscript.gene_id, - ), - ( - ObjectXrefUORM.ensembl_object_type == "Translation", - GTTTranslation.gene_id, - ), + (ObjectXrefUORM.ensembl_object_type == "Transcript", GTTTranscript.gene_id), + (ObjectXrefUORM.ensembl_object_type == "Translation", GTTTranslation.gene_id), ], ).label("d_gene_id") - # Get all relevent xrefs for this object type based on precendence sources + # Get all relevant xrefs for this object type based on precedence sources query = ( select( gene_case_stmt, @@ -558,53 +506,40 @@ def set_gene_descriptions(self) -> None: .join(SourceUORM, SourceUORM.source_id == GeneDescPriorityORM.source_id) .join(XrefUORM, XrefUORM.source_id == SourceUORM.source_id) .join(ObjectXrefUORM, ObjectXrefUORM.xref_id == XrefUORM.xref_id) - .join( - IdentityXrefUORM, - IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, - ) + .join(IdentityXrefUORM, IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id) .outerjoin(GTTGene, GTTGene.gene_id == ObjectXrefUORM.ensembl_id) - .outerjoin( - GTTTranscript, GTTTranscript.transcript_id == ObjectXrefUORM.ensembl_id - ) - .outerjoin( - GTTTranslation, - GTTTranslation.translation_id == ObjectXrefUORM.ensembl_id, - ) + .outerjoin(GTTTranscript, GTTTranscript.transcript_id == ObjectXrefUORM.ensembl_id) + .outerjoin(GTTTranslation, GTTTranslation.translation_id == ObjectXrefUORM.ensembl_id) .where(ObjectXrefUORM.ox_status == "DUMP_OUT") .order_by( "d_gene_id", ObjectXrefUORM.ensembl_object_type, GeneDescPriorityORM.priority, - desc( - IdentityXrefUORM.target_identity + IdentityXrefUORM.query_identity - ), + desc(IdentityXrefUORM.target_identity + IdentityXrefUORM.query_identity), ) ) for row in xref_dbi.execute(query).mappings().all(): - if gene_desc_updated.get(row.d_gene_id): + if row.d_gene_id in gene_desc_updated: continue if row.description: # Apply regular expressions to description filtered_description = self.filter_by_regexp(row.description, reg_exps) - if filtered_description != "": - source_name = source_id_to_external_name.get(row.source_id) - filtered_description += ( - f" [Source:{source_name};Acc:{row.accession}]" - ) + if filtered_description: + if row.source_id not in no_source_name_in_desc: + source_name = source_id_to_external_name.get(row.source_id) + filtered_description += f" [Source:{source_name};Acc:{row.accession}]" - # Update the gene 
description - core_dbi.execute( - update(GeneORM) - .values(description=filtered_description) - .where( - GeneORM.gene_id == row.d_gene_id, GeneORM.description == None + # Update the gene description + core_dbi.execute( + update(GeneORM) + .values(description=filtered_description) + .where(GeneORM.gene_id == row.d_gene_id, GeneORM.description == None) ) - ) - gene_desc_updated[row.d_gene_id] = 1 + gene_desc_updated[row.d_gene_id] = True - logging.info(f"{len(gene_desc_updated.keys())} gene descriptions added") + logging.info(f"{len(gene_desc_updated)} gene descriptions added") # Reset ignored object xrefs xref_dbi.execute( @@ -618,14 +553,16 @@ def set_gene_descriptions(self) -> None: def get_external_name_mappings(self, core_dbi: Connection, xref_dbi: Connection) -> Tuple[Dict[int, str], Dict[str, int]]: # Get external display names - external_name_to_display_name = {} - query = select( - ExternalDbORM.external_db_id, - ExternalDbORM.db_name, - ExternalDbORM.db_display_name, - ) - for row in core_dbi.execute(query).mappings().all(): - external_name_to_display_name[row.db_name] = row.db_display_name + external_name_to_display_name = { + row.db_name: row.db_display_name + for row in core_dbi.execute( + select( + ExternalDbORM.external_db_id, + ExternalDbORM.db_name, + ExternalDbORM.db_display_name, + ) + ).mappings().all() + } # Get sources for available xrefs source_id_to_external_name, source_name_to_source_id = {}, {} @@ -645,26 +582,6 @@ def get_external_name_mappings(self, core_dbi: Connection, xref_dbi: Connection) return source_id_to_external_name, source_name_to_source_id - def set_source_id_to_external_name(self, name_to_external_name: Dict[str, str], dbi: Connection) -> Tuple[Dict[int, str], Dict[str, int]]: - source_id_to_external_name, name_to_source_id = {}, {} - - # Get sources for available xrefs - query = ( - select(SourceUORM.source_id, SourceUORM.name) - .where(SourceUORM.source_id == XrefUORM.source_id) - .group_by(SourceUORM.source_id) - ) - for row in dbi.execute(query).mappings().all(): - if name_to_external_name.get(row.name): - source_id_to_external_name[row.source_id] = name_to_external_name[row.name] - name_to_source_id[row.name] = row.source_id - elif re.search(r"notransfer$", row.name): - logging.info(f"Ignoring notransfer source '{row.name}'") - else: - raise LookupError(f"Could not find {row.name} in external_db table") - - return source_id_to_external_name, name_to_source_id - def gene_description_sources(self) -> Tuple[List[str], Dict[str, Select]]: return self.gene_display_xref_sources() @@ -746,20 +663,25 @@ def filter_by_regexp(self, string: str, regular_expressions: List[str]) -> str: return string def set_meta_timestamp(self) -> None: + logging.info("Setting meta timestamp for xrefs") + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + with self.core().connect() as dbi: + # Delete existing xref timestamp dbi.execute(delete(MetaCORM).where(MetaCORM.meta_key == "xref.timestamp")) - now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + # Insert new xref timestamp dbi.execute( insert(MetaCORM).values(meta_key="xref.timestamp", meta_value=now) ) + logging.info(f"Meta timestamp set to {now}") + def set_display_xrefs_from_stable_table(self) -> None: logging.info("Setting Transcript and Gene display xrefs using stable IDs") # Get the xref offset used when adding the xrefs into the core DB - xref_offset = self.get_meta_value("xref_offset") - xref_offset = int(xref_offset) + xref_offset = int(self.get_meta_value("xref_offset")) logging.info(f"Using xref offset 
of {xref_offset}") xref_dbi = self.xref().connect() @@ -776,26 +698,8 @@ def set_display_xrefs_from_stable_table(self) -> None: .where(GeneORM.description.like("%[Source:%]%")) ) - # Get external names and IDs - name_to_external_name, source_id_to_external_name = {}, {} - query = select( - ExternalDbORM.external_db_id, - ExternalDbORM.db_name, - ExternalDbORM.db_display_name, - ) - for row in core_dbi.execute(query).mappings().all(): - name_to_external_name[row.db_name] = row.db_display_name - - query = ( - select(SourceUORM.source_id, SourceUORM.name) - .where(SourceUORM.source_id == XrefUORM.source_id) - .group_by(SourceUORM.source_id) - ) - for row in xref_dbi.execute(query).mappings().all(): - if name_to_external_name.get(row.name): - source_id_to_external_name[row.source_id] = name_to_external_name[ - row.name - ] + # Get source ID to external names mappings + source_id_to_external_name, name_to_source_id = self.get_external_name_mappings(core_dbi, xref_dbi) gene_count = 0 @@ -818,7 +722,7 @@ def set_display_xrefs_from_stable_table(self) -> None: ) # Set description - if row.description is not None and row.description != "": + if row.description: description = f"{row.description} [Source:{source_id_to_external_name[row.source_id]};Acc:{row.accession}]" core_dbi.execute( update(GeneORM) diff --git a/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py b/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py index e4c33bf75..82768724e 100644 --- a/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py +++ b/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py @@ -14,8 +14,25 @@ """Mapper module for setting the feature names.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +import re +from typing import Any, Dict, Tuple, List +from sqlalchemy import select, func, update, case, desc, insert, aliased, delete +from sqlalchemy.engine import Connection + +from ensembl.xrefs.xref_update_db_model import ( + GeneTranscriptTranslation as GeneTranscriptTranslationORM, + GeneStableId as GeneStableIdORM, + TranscriptStableId as TranscriptStableIdORM, + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + Synonym as SynonymORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class OfficialNaming(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -33,7 +50,7 @@ def official_name(self, official_name: str = None) -> str: def run(self, species_id: int, verbose: bool) -> None: logging.info("Starting official naming") - # If no offical name then we do not want to go any further + # If no official name then we do not want to go any further dbname = self.official_name() if not dbname: self.update_process_status("official_naming_done") @@ -42,30 +59,24 @@ def run(self, species_id: int, verbose: bool) -> None: xref_dbi = self.xref().connect() # If there are any official names on transcripts or translations, move them onto gene level - if dbname == "MGI": - self.biomart_fix("MGI", "Translation", "Gene", xref_dbi) - self.biomart_fix("MGI", "Transcript", "Gene", xref_dbi) - if dbname == "ZFIN_ID": - self.biomart_fix("ZFIN_ID", "Translation", "Gene", xref_dbi) - self.biomart_fix("ZFIN_ID", "Transcript", "Gene", xref_dbi) - if dbname == "RGD": - self.biomart_fix("RGD", "Translation", "Gene", xref_dbi) - self.biomart_fix("RGD", "Transcript", "Gene", xref_dbi) + for name in ["MGI", "ZFIN_ID", "RGD"]: + if dbname == 
name: + self.biomart_fix(name, "Translation", "Gene", xref_dbi) + self.biomart_fix(name, "Transcript", "Gene", xref_dbi) # Get the current max values for xref and object_xref - max_xref_id = xref_dbi.execute(select(func.max(XrefUORM.xref_id))).scalar() - max_xref_id = int(max_xref_id) - max_object_xref_id = xref_dbi.execute( - select(func.max(ObjectXrefUORM.object_xref_id)) - ).scalar() - max_object_xref_id = int(max_object_xref_id) + max_xref_id = int(xref_dbi.execute(select(func.max(XrefUORM.xref_id))).scalar()) + max_object_xref_id = int(xref_dbi.execute(select(func.max(ObjectXrefUORM.object_xref_id))).scalar()) # Get labels, descriptions, and synonyms display_label_to_desc = self.get_display_label_data(dbname, xref_dbi) - synonyms = self.get_synonyms(dbname, xref_dbi) # Get source IDs - dbname_to_source_id = self.get_dbname_to_source_id(dbname, xref_id) + dbname_to_source_id = self.get_dbname_to_source_id(dbname, xref_dbi) + + # Delete old data (from previous run) + logging.info(f"Deleting old data for sources: {', '.join(dbname_to_source_id.keys())}") + self.delete_old_data(dbname_to_source_id.values(), xref_dbi) # Reset gene and transcript stable id display data self.reset_display_xrefs(xref_dbi) @@ -83,13 +94,12 @@ def run(self, species_id: int, verbose: bool) -> None: ) .where( GeneTranscriptTranslationORM.gene_id == GeneStableIdORM.internal_id, - GeneTranscriptTranslationORM.transcript_id - == TranscriptStableIdORM.internal_id, + GeneTranscriptTranslationORM.transcript_id == TranscriptStableIdORM.internal_id, ) .order_by(GeneStableIdORM.stable_id, TranscriptStableIdORM.stable_id) ) for row in xref_dbi.execute(query).mappings().all(): - if not gene_to_transcripts.get(row.gene_id): + if row.gene_id not in gene_to_transcripts: sorted_gene_ids.append(row.gene_id) gene_to_transcripts.setdefault(row.gene_id, []).append(row.transcript_id) @@ -116,7 +126,7 @@ def run(self, species_id: int, verbose: bool) -> None: ObjectXrefUORM.ox_status == "DUMP_OUT", ) for row in xref_dbi.execute(query).mappings().all(): - ignore_object[row.object_xref_id] = 1 + ignore_object[row.object_xref_id] = True xref_added, seen_gene, official_name_used = {}, {}, {} @@ -125,7 +135,7 @@ def run(self, species_id: int, verbose: bool) -> None: transcript_source = dbname gene_symbol, gene_symbol_xref_id, is_lrg = None, None, 0 - # Get offical name if it has one + # Get official name if it has one gene_symbol, gene_symbol_xref_id = self.get_official_domain_name( { "gene_id": gene_id, @@ -138,22 +148,15 @@ def run(self, species_id: int, verbose: bool) -> None: ) if gene_symbol_xref_id: - official_name_used[gene_symbol_xref_id] = 1 + official_name_used[gene_symbol_xref_id] = True # If not found see if there is an LRG entry if not gene_symbol: - gene_symbol, gene_symbol_xref_id, is_lrg = self.find_lrg_hgnc( - gene_id, xref_dbi - ) + gene_symbol, gene_symbol_xref_id, is_lrg = self.find_lrg_hgnc(gene_id, xref_dbi) # If not found look for other valid database sources (RFAM and miRBase, EntrezGene) if not gene_symbol: - ( - gene_symbol, - gene_symbol_xref_id, - transcript_source, - display_label_to_desc, - ) = self.find_from_other_sources( + gene_symbol, gene_symbol_xref_id, transcript_source, display_label_to_desc = self.find_from_other_sources( ignore_object, { "gene_id": gene_id, @@ -173,26 +176,22 @@ def run(self, species_id: int, verbose: bool) -> None: if not is_lrg: # Set transcript names - max_xref_id, max_object_xref_id, xref_added, seen_gene = ( - self.set_transcript_display_xrefs( - { - "max_xref_id": max_xref_id, 
- "max_object_xref_id": max_object_xref_id, - "gene_id": gene_id, - "gene_id_to_stable_id": gene_id_to_stable_id, - "gene_symbol": gene_symbol, - "description": description, - "source_id": dbname_to_source_id.get( - f"{transcript_source}_trans_name" - ), - "xref_added": xref_added, - "seen_gene": seen_gene, - "transcript_ids": gene_to_transcripts.get(gene_id, []), - "transcript_source": transcript_source, - "species_id": species_id, - }, - xref_dbi, - ) + max_xref_id, max_object_xref_id = self.set_transcript_display_xrefs( + { + "max_xref_id": max_xref_id, + "max_object_xref_id": max_object_xref_id, + "gene_id": gene_id, + "gene_id_to_stable_id": gene_id_to_stable_id, + "gene_symbol": gene_symbol, + "description": description, + "source_id": dbname_to_source_id.get(f"{transcript_source}_trans_name"), + "transcript_ids": gene_to_transcripts.get(gene_id, []), + "transcript_source": transcript_source, + "species_id": species_id, + }, + xref_added, + seen_gene, + xref_dbi, ) xref_dbi.close() @@ -217,49 +216,35 @@ def get_display_label_data(self, dbname: str, dbi: Connection) -> Dict[str, str] XrefUORM.source_id == SourceUORM.source_id, SourceUORM.name.like(dbname) ) for row in dbi.execute(query).mappings().all(): - if not row.description: - no_descriptions += 1 - else: + if row.description: label_to_desc[row.label] = row.description + else: + no_descriptions += 1 if no_descriptions: - logging.warn(f"Descriptions not defined for {no_descriptions} labels") + logging.warning(f"Descriptions not defined for {no_descriptions} labels") return label_to_desc - def get_synonyms(self, dbname: str, dbi: Connection) -> Dict[str, str]: - synonyms = {} - - # Connect synonyms with xref labels - query = select(SynonymORM.synonym, XrefUORM.label).where( - XrefUORM.xref_id == SynonymORM.xref_id, - SourceUORM.source_id == XrefUORM.source_id, - SourceUORM.name.like(dbname), - ) - for row in dbi.execute(query).mappings().all(): - synonyms[row.synonym] = row.label - - return synonyms - def get_dbname_to_source_id(self, dbname: str, dbi: Connection) -> Dict[str, int]: dbname_to_source_id = {} + # List of source names to look for sources_list = [ "RFAM_trans_name", "miRBase_trans_name", "EntrezGene_trans_name", + f"{dbname}_trans_name", ] - sources_list.append(f"{dbname}_trans_name") - sources_list.append(dbname) source_error = 0 for source_name in sources_list: source_id = dbi.execute( - select(SourceUORM.source_id).where(SourceUORM.name.like(source_name)) + select(SourceUORM.source_id).where(SourceUORM.name == source_name) ).scalar() if not source_id: - logging.warn(f"Could not find external database '{source_name}'") + logging.warning(f"Could not find external database '{source_name}'") source_error += 1 else: dbname_to_source_id[source_name] = source_id @@ -271,6 +256,23 @@ def get_dbname_to_source_id(self, dbname: str, dbi: Connection) -> Dict[str, int return dbname_to_source_id + def delete_old_data(self, source_ids_to_delete: List[int], dbi: Connection) -> None: + # Delete from synonym + query = delete(SynonymORM).where(SynonymORM.xref_id == XrefUORM.xref_id, XrefUORM.source_id.in_(source_ids_to_delete)) + dbi.execute(query) + + # Delete from identity_xref + query = delete(IdentityXrefUORM).where(IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, ObjectXrefUORM.xref_id == XrefUORM.xref_id, XrefUORM.source_id.in_(source_ids_to_delete)) + dbi.execute(query) + + # Delete from object_xref + query = delete(ObjectXrefUORM).where(ObjectXrefUORM.xref_id == XrefUORM.xref_id, 
XrefUORM.source_id.in_(source_ids_to_delete)) + dbi.execute(query) + + # Delete from xref + query = delete(XrefUORM).where(XrefUORM.source_id.in_(source_ids_to_delete)) + dbi.execute(query) + def reset_display_xrefs(self, dbi: Connection) -> None: dbi.execute(update(TranscriptStableIdORM).values(display_xref_id=None)) @@ -311,12 +313,12 @@ def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tup if row.priority < best_level: display_names.clear() - display_names[row.xref_id] = 1 + display_names[row.xref_id] = True best_level = row.priority elif row.priority == best_level: - display_names[row.xref_id] = 1 + display_names[row.xref_id] = True - # Check if the best names has been found, and remove the others if so + # Check if the best name has been found, and remove the others if so if name_count > 1 and len(display_names) == 1: if verbose: logging.info( @@ -336,13 +338,13 @@ def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tup # Perfect case, one best name found if len(display_names) == 1: - xref_id = display_names.keys()[0] + xref_id = next(iter(display_names)) return xref_id_to_display[xref_id], xref_id - # Try to find the best names out of multiple ones + # Try to find the best name out of multiple ones if len(display_names) > 1: temp_best_identity = 0 - best_ids, best_list = [], [] + best_ids = {} # Fail xrefs with worse % identity if we can (query or target identity whichever is greater) case_stmt = case( @@ -371,18 +373,15 @@ def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tup for row in dbi.execute(query).mappings().all(): if row.best_identity > temp_best_identity: best_ids.clear() - best_ids[row.xref_id] = 1 + best_ids[row.xref_id] = True temp_best_identity = row.best_identity elif row.best_identity == temp_best_identity: - best_ids[row.xref_id] = 1 + best_ids[row.xref_id] = True else: break - for xref_id in display_names.keys(): - best_list[xref_id_to_display[xref_id]] = 1 - # Check if we were able to reduce the number of xrefs based on % identity - if len(best_ids) > 0 and len(best_ids) < len(display_names): + if 0 < len(best_ids) < len(display_names): display_names = best_ids if verbose: logging.info( @@ -401,15 +400,12 @@ def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tup return gene_symbol, gene_symbol_xref_id # Take the name which hasn't been already assigned to another gene, if possible - xref_not_used = None - for xref_id in display_names.keys(): - if not official_name_used.get(xref_id): - xref_not_used = xref_id + xref_not_used = next((xref_id for xref_id in display_names if not official_name_used.get(xref_id)), None) if xref_not_used: if verbose: logging.info(f"For gene {gene_id_to_stable_id[gene_id]}:") - for xref_id in display_names.keys(): + for xref_id in display_names: if xref_id == xref_not_used: if verbose: logging.info(f"\t{xref_id_to_display[xref_id]} chosen") @@ -421,9 +417,8 @@ def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tup f"\t{xref_id_to_display[xref_id]} (left as {dbname} reference but not gene symbol)" ) else: - index = 0 - for xref_id in display_names.keys(): - if not index: + for index, xref_id in enumerate(display_names): + if index == 0: if verbose: logging.info( f"\t{xref_id_to_display[xref_id]} chosen as first" @@ -435,11 +430,10 @@ def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tup logging.info( f"\t{xref_id_to_display[xref_id]} (left as {dbname} reference but not gene symbol)" ) 
- index += 1 return gene_symbol, gene_symbol_xref_id - def set_the_best_display_name(self, display_names: Dict[int, int], xref_list: List[int], object_xref_list: List[int], xref_id_to_display: Dict[int, str], verbose: bool, dbi: Connection) -> Tuple[str, int]: + def set_the_best_display_name(self, display_names: Dict[int, bool], xref_list: List[int], object_xref_list: List[int], xref_id_to_display: Dict[int, str], verbose: bool, dbi: Connection) -> Tuple[str, int]: gene_symbol, gene_symbol_xref_id = None, None for xref_id in xref_list: @@ -462,7 +456,7 @@ def find_lrg_hgnc(self, gene_id: int, dbi: Connection) -> Tuple[str, int, bool]: gene_symbol, gene_symbol_xref_id = None, None is_lrg = False - # Look for LRG_HGNC_notransfer, if found then find HGNC equiv and set to this + # Look for LRG_HGNC_notransfer, if found then find HGNC equivalent and set to this query = select( XrefUORM.label, XrefUORM.xref_id, @@ -476,11 +470,12 @@ def find_lrg_hgnc(self, gene_id: int, dbi: Connection) -> Tuple[str, int, bool]: ObjectXrefUORM.ensembl_object_type == "Gene", ) for row in dbi.execute(query).mappings().all(): - # Set status to NO_DISPLAY as we do not want this transferred, just the equivalent hgnc + # Set status to NO_DISPLAY as we do not want this transferred, just the equivalent HGNC self.update_object_xref_status(row.object_xref_id, "NO_DISPLAY") - new_xref_id, priority = None, None - query = ( + # Find the equivalent HGNC xref + new_xref_id = None + result = dbi.execute( select(XrefUORM.xref_id, SourceUORM.priority) .where( XrefUORM.xref_id == ObjectXrefUORM.xref_id, @@ -490,10 +485,9 @@ def find_lrg_hgnc(self, gene_id: int, dbi: Connection) -> Tuple[str, int, bool]: ObjectXrefUORM.ox_status == "DUMP_OUT", ) .order_by(SourceUORM.priority) - ) - result = dbi.execute(query).fetchall() + ).fetchall() if result: - new_xref_id, priority = result[0] + new_xref_id = result[0][0] if new_xref_id: gene_symbol = row.label @@ -502,7 +496,7 @@ def find_lrg_hgnc(self, gene_id: int, dbi: Connection) -> Tuple[str, int, bool]: return gene_symbol, gene_symbol_xref_id, is_lrg - def find_from_other_sources(self, ignore: Dict[int, int], args: Dict[str, Any], dbi: Connection) -> Tuple[str, int, str, Dict[str, str]]: + def find_from_other_sources(self, ignore: Dict[int, bool], args: Dict[str, Any], dbi: Connection) -> Tuple[str, int, str, Dict[str, str]]: gene_id = args["gene_id"] display_label_to_desc = args["display_label_to_desc"] transcript_source = args["transcript_source"] @@ -510,6 +504,7 @@ def find_from_other_sources(self, ignore: Dict[int, int], args: Dict[str, Any], gene_symbol, gene_symbol_xref_id = None, None other_name_number, found_gene = {}, {} + # Iterate through the list of databases to find gene symbols for dbname in ["miRBase", "RFAM", "EntrezGene"]: query = select( XrefUORM.label, @@ -549,7 +544,7 @@ def find_from_other_sources(self, ignore: Dict[int, int], args: Dict[str, Any], return gene_symbol, gene_symbol_xref_id, transcript_source, display_label_to_desc - def set_transcript_display_xrefs(self, args: Dict[str, Any], dbi: Connection) -> Tuple[int, int, Dict[str, int], Dict[str, int]]: + def set_transcript_display_xrefs(self, args: Dict[str, Any], xref_added: Dict[str, int], seen_gene: Dict[str, int], dbi: Connection) -> Tuple[int, int]: max_xref_id = args["max_xref_id"] max_object_xref_id = args["max_object_xref_id"] gene_id = args["gene_id"] @@ -557,21 +552,17 @@ def set_transcript_display_xrefs(self, args: Dict[str, Any], dbi: Connection) -> gene_symbol = args["gene_symbol"] description = 
args["description"] source_id = args["source_id"] - xref_added = args["xref_added"] - seen_gene = args["seen_gene"] transcript_ids = args["transcript_ids"] transcript_source = args["transcript_source"] species_id = args["species_id"] - # Do nothing is LRG + # Do nothing if LRG if re.search("LRG", gene_id_to_stable_id.get(gene_id)): - return + return max_xref_id, max_object_xref_id - ext = 201 - if seen_gene.get(gene_symbol): - ext = seen_gene[gene_symbol] + ext = seen_gene.get(gene_symbol, 201) - # Go thourgh transcripts + # Go through transcripts for transcript_id in transcript_ids: transcript_name = f"{gene_symbol}-{ext}" @@ -581,7 +572,7 @@ def set_transcript_display_xrefs(self, args: Dict[str, Any], dbi: Connection) -> ) index = f"{transcript_name}:{source_id}" - if not xref_added.get(index): + if index not in xref_added: # Add new xref for the transcript name max_xref_id += 1 dbi.execute( @@ -634,4 +625,4 @@ def set_transcript_display_xrefs(self, args: Dict[str, Any], dbi: Connection) -> seen_gene[gene_symbol] = ext - return max_xref_id, max_object_xref_id, xref_added, seen_gene + return max_xref_id, max_object_xref_id diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessMappings.py b/src/python/ensembl/production/xrefs/mappers/ProcessMappings.py index 53832520c..dccdd08f8 100644 --- a/src/python/ensembl/production/xrefs/mappers/ProcessMappings.py +++ b/src/python/ensembl/production/xrefs/mappers/ProcessMappings.py @@ -14,8 +14,24 @@ """Mapper module for processing sequence matched xref data.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import os +import re +import logging +from sqlalchemy import select, insert, update, func +from sqlalchemy.engine import Connection + +from ensembl.xrefs.xref_update_db_model import ( + TranscriptStableId as TranscriptStableIdORM, + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + Mapping as MappingORM, + MappingJobs as MappingJobsORM, +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class ProcessMappings(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -51,9 +67,7 @@ def process_mappings(self) -> None: MappingJobsORM.job_id, ) for mapping_job in xref_dbi.execute(mapping_query).mappings().all(): - root_dir = mapping_job.root_dir - if root_dir is None: - root_dir = "" + root_dir = mapping_job.root_dir or "" err_file = os.path.join(root_dir, mapping_job.err_file) out_file = os.path.join(root_dir, mapping_job.out_file) @@ -63,61 +77,62 @@ def process_mappings(self) -> None: if mapping_job.status == "SUCCESS": already_processed_count += 1 - else: - if os.path.exists(err_file) and os.path.getsize(err_file) > 0: - error_count += 1 + continue - # Display errors on STDERR - logging.warning(f"Problem {err_file} is non zero") - try: - with open(err_file) as fh: - for line in fh: - logging.warning(f"#{line}") - except: - logging.debug( - f"No error file exists {err_file}???\n Resubmit this job" - ) + if os.path.exists(err_file) and os.path.getsize(err_file) > 0: + error_count += 1 - if mapping_job.status == "SUBMITTED": - update_status = "FAILED" - else: - # Process the mapping file - if os.path.exists(map_file): - count = self.process_map_file( - map_file, - query_cutoff[mapping_job.job_id], - target_cutoff[mapping_job.job_id], - mapping_job.job_id, - mapping_job.array_number, - xref_dbi, - ) - if count > 0: - processed_count += 1 - update_status = "SUCCESS" - elif count == 0: - 
processed_count += 1 - empty_count += 1 - update_status = "SUCCESS" - else: - error_count += 1 - update_status = "FAILED" + # Display errors on STDERR + logging.warning(f"Problem {err_file} is non zero") + try: + with open(err_file) as fh: + for line in fh: + logging.warning(f"#{line}") + except Exception as e: + logging.debug( + f"No error file exists {err_file}???\n Resubmit this job. Error: {e}" + ) + + if mapping_job.status == "SUBMITTED": + update_status = "FAILED" + else: + # Process the mapping file + if os.path.exists(map_file): + count = self.process_map_file( + map_file, + query_cutoff[mapping_job.job_id], + target_cutoff[mapping_job.job_id], + mapping_job.job_id, + mapping_job.array_number, + xref_dbi, + ) + if count > 0: + processed_count += 1 + update_status = "SUCCESS" + elif count == 0: + processed_count += 1 + empty_count += 1 + update_status = "SUCCESS" else: error_count += 1 - logging.debug( - f"Could not open map file {map_file}???\n Resubmit this job" - ) update_status = "FAILED" - - # Update mapping job status - if update_status: - xref_dbi.execute( - update(MappingJobsORM) - .where( - MappingJobsORM.job_id == mapping_job.job_id, - MappingJobsORM.array_number == mapping_job.array_number, - ) - .values(status=update_status) + else: + error_count += 1 + logging.debug( + f"Could not open map file {map_file}???\n Resubmit this job" + ) + update_status = "FAILED" + + # Update mapping job status + if update_status: + xref_dbi.execute( + update(MappingJobsORM) + .where( + MappingJobsORM.job_id == mapping_job.job_id, + MappingJobsORM.array_number == mapping_job.array_number, ) + .values(status=update_status) + ) logging.info( f"Already processed = {already_processed_count}, processed = {processed_count}, errors = {error_count}, empty = {empty_count}" @@ -136,13 +151,12 @@ def process_map_file(self, map_file: str, query_cutoff: int, target_cutoff: int, # Get max object xref id object_xref_id = dbi.execute( select(func.max(ObjectXrefUORM.object_xref_id)) - ).scalar() - if not object_xref_id: - object_xref_id = 0 + ).scalar() or 0 total_lines, last_query_id = 0, 0 - best_match_found, best_identity, best_score = 0, 0, 0 - first = 1 + best_match_found = False + best_identity, best_score = 0, 0 + first = True mRNA_biotypes = { "protein_coding": 1, @@ -153,210 +167,208 @@ def process_map_file(self, map_file: str, query_cutoff: int, target_cutoff: int, } try: - mh = open(map_file) - except: - logging.debug(f"Could not open map file {map_file}\n Resubmit this job") - return -1 + with open(map_file) as mh: + for line in mh: + load_object_xref = False + total_lines += 1 - for line in mh: - load_object_xref = 0 - total_lines += 1 - - ( - label, - query_id, - target_id, - identity, - query_length, - target_length, - query_start, - query_end, - target_start, - target_end, - cigar_line, - score, - ) = line.strip().split(":") - - # Fix varibale types (for integer comparisons) - identity = int(identity) - score = int(score) - query_length = int(query_length) - target_length = int(target_length) - query_start = int(query_start) - target_start = int(target_start) - - if last_query_id != query_id: - best_match_found = 0 - best_score = 0 - best_identity = 0 - else: - # Ignore mappings with worse identity or score if we already found a good mapping - if ( - identity < best_identity or score < best_score - ) and best_match_found: - continue - - if ensembl_type == "Translation": - load_object_xref = 1 - else: - # Check if source name is RefSeq_ncRNA or RefSeq_mRNA - # If yes check biotype, if ok 
store object xref - source_name = dbi.execute( - select(SourceUORM.name) - .join(XrefUORM, XrefUORM.source_id == SourceUORM.source_id) - .where(XrefUORM.xref_id == query_id) - ).scalar() - - if source_name and ( - re.search(r"^RefSeq_(m|nc)RNA", source_name) - or re.search(r"^miRBase", source_name) - or re.search(r"^RFAM", source_name) - ): - # Make sure mRNA xrefs are matched to protein_coding biotype only - biotype = dbi.execute( - select(TranscriptStableIdORM.biotype).where( - TranscriptStableIdORM.internal_id == target_id - ) - ).scalar() - - if re.search(r"^RefSeq_mRNA", source_name) and mRNA_biotypes.get( - biotype - ): - load_object_xref = 1 - if re.search( - r"^RefSeq_ncRNA", source_name - ) and not mRNA_biotypes.get(biotype): - load_object_xref = 1 - if ( - re.search(r"^miRBase", source_name) - or re.search(r"^RFAM", source_name) - ) and re.search("RNA", biotype): - load_object_xref = 1 - else: - load_object_xref = 1 + ( + label, + query_id, + target_id, + identity, + query_length, + target_length, + query_start, + query_end, + target_start, + target_end, + cigar_line, + score, + ) = line.strip().split(":") + + # Fix variable types (for integer comparisons) + identity = int(identity) + score = int(score) + query_length = int(query_length) + target_length = int(target_length) + query_start = int(query_start) + target_start = int(target_start) + + if last_query_id != query_id: + best_match_found = False + best_score = 0 + best_identity = 0 + else: + # Ignore mappings with worse identity or score if we already found a good mapping + if ( + (identity < best_identity or score < best_score) + and best_match_found + ): + continue + + if ensembl_type == "Translation": + load_object_xref = True + else: + # Check if source name is RefSeq_ncRNA or RefSeq_mRNA + # If yes check biotype, if ok store object xref + source_name = dbi.execute( + select(SourceUORM.name) + .join(XrefUORM, XrefUORM.source_id == SourceUORM.source_id) + .where(XrefUORM.xref_id == query_id) + ).scalar() + + if source_name and ( + re.search(r"^RefSeq_(m|nc)RNA", source_name) + or re.search(r"^miRBase", source_name) + or re.search(r"^RFAM", source_name) + ): + # Make sure mRNA xrefs are matched to protein_coding biotype only + biotype = dbi.execute( + select(TranscriptStableIdORM.biotype).where( + TranscriptStableIdORM.internal_id == target_id + ) + ).scalar() + + if re.search(r"^RefSeq_mRNA", source_name) and mRNA_biotypes.get( + biotype + ): + load_object_xref = True + if re.search( + r"^RefSeq_ncRNA", source_name + ) and not mRNA_biotypes.get(biotype): + load_object_xref = True + if ( + re.search(r"^miRBase", source_name) + or re.search(r"^RFAM", source_name) + ) and re.search("RNA", biotype): + load_object_xref = True + else: + load_object_xref = True - last_query_id = query_id + last_query_id = query_id - # Check if found a better match - if score > best_score or identity > best_identity: - best_score = score - best_identity = identity + # Check if found a better match + if score > best_score or identity > best_identity: + best_score = score + best_identity = identity - if not load_object_xref: - continue - else: - best_match_found = 1 + if not load_object_xref: + continue + else: + best_match_found = True - if not score: - self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) - raise ValueError(f"No score on line. Possible file corruption\n{line}") + if not score: + self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) + raise ValueError(f"No score on line. 
Possible file corruption\n{line}") - # Calculate percentage identities - query_identity = int(100 * identity / query_length) - target_identity = int(100 * identity / target_length) + # Calculate percentage identities + query_identity = int(100 * identity / query_length) + target_identity = int(100 * identity / target_length) - # Only keep alignments where both sequences match cutoff - status = "DUMP_OUT" - if query_identity < query_cutoff or target_identity < target_cutoff: - status = "FAILED_CUTOFF" + # Only keep alignments where both sequences match cutoff + status = "DUMP_OUT" + if query_identity < query_cutoff or target_identity < target_cutoff: + status = "FAILED_CUTOFF" - # Add object xref row - object_xref_id = self.get_object_xref_id( - target_id, query_id, ensembl_type, "SEQUENCE_MATCH", dbi, None, status - ) - if object_xref_id: - continue - else: - try: - object_xref_id = self.add_object_xref( - target_id, - query_id, - ensembl_type, - "SEQUENCE_MATCH", - dbi, - None, - status, + # Add object xref row + object_xref_id = self.get_object_xref_id( + target_id, query_id, ensembl_type, "SEQUENCE_MATCH", dbi, None, status ) - except: - self.update_object_xref_end( - job_id, array_number, object_xref_id, dbi - ) - raise IOError(f"Problem adding object_xref row") - - if first: - self.update_object_xref_start(job_id, array_number, object_xref_id, dbi) - first = 0 - - cigar_line = re.sub(" ", "", cigar_line) - cigar_line = re.sub(r"([MDI])(\d+)", r"\2\1", cigar_line) - - # Add identity xref row - try: - identity_xref_query = insert(IdentityXrefUORM).values( - object_xref_id=object_xref_id, - query_identity=query_identity, - target_identity=target_identity, - hit_start=query_start + 1, - hit_end=query_end, - translation_start=target_start + 1, - translation_end=target_end, - cigar_line=cigar_line, - score=score, - ) - dbi.execute(identity_xref_query) - except: - self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) - raise IOError(f"Problem loading identity_xref") - - master_xref_ids = [query_id] - for master_xref_id in master_xref_ids: - # Get all dependents related to master xref - dep_query = select(DependentXrefUORM.dependent_xref_id).where( - DependentXrefUORM.master_xref_id == master_xref_id - ) - for dep in dbi.execute(dep_query).mappings().all(): - # Add dependent object xref - dep_object_xref_id = self.get_object_xref_id( - target_id, - dep.dependent_xref_id, - ensembl_type, - "DEPENDENT", - dbi, - master_xref_id, - status, - ) - if dep_object_xref_id: + if object_xref_id: continue else: try: - dep_object_xref_id = self.add_object_xref( + object_xref_id = self.add_object_xref( target_id, - dep.dependent_xref_id, + query_id, ensembl_type, - "DEPENDENT", + "SEQUENCE_MATCH", dbi, - master_xref_id, + None, status, ) except: self.update_object_xref_end( job_id, array_number, object_xref_id, dbi ) - raise IOError(f"Problem adding dependent object xref row") + raise IOError(f"Problem adding object_xref row") - # Add dependent identity xref - dbi.execute( - insert(IdentityXrefUORM).values( - object_xref_id=dep_object_xref_id, + if first: + self.update_object_xref_start(job_id, array_number, object_xref_id, dbi) + first = False + + cigar_line = re.sub(" ", "", cigar_line) + cigar_line = re.sub(r"([MDI])(\d+)", r"\2\1", cigar_line) + + # Add identity xref row + try: + identity_xref_query = insert(IdentityXrefUORM).values( + object_xref_id=object_xref_id, query_identity=query_identity, target_identity=target_identity, + hit_start=query_start + 1, + hit_end=query_end, + 
translation_start=target_start + 1, + translation_end=target_end, + cigar_line=cigar_line, + score=score, ) - ) - - # Get the dependent dependents just in case - master_xref_ids.append(dep.dependent_xref_id) + dbi.execute(identity_xref_query) + except: + self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) + raise IOError(f"Problem loading identity_xref") + + master_xref_ids = [query_id] + for master_xref_id in master_xref_ids: + # Get all dependents related to master xref + dep_query = select(DependentXrefUORM.dependent_xref_id).where( + DependentXrefUORM.master_xref_id == master_xref_id + ) + for dep in dbi.execute(dep_query).mappings().all(): + # Add dependent object xref + dep_object_xref_id = self.get_object_xref_id( + target_id, + dep.dependent_xref_id, + ensembl_type, + "DEPENDENT", + dbi, + master_xref_id, + status, + ) + if dep_object_xref_id: + continue + else: + try: + dep_object_xref_id = self.add_object_xref( + target_id, + dep.dependent_xref_id, + ensembl_type, + "DEPENDENT", + dbi, + master_xref_id, + status, + ) + except: + self.update_object_xref_end( + job_id, array_number, object_xref_id, dbi + ) + raise IOError(f"Problem adding dependent object xref row") + + # Add dependent identity xref + dbi.execute( + insert(IdentityXrefUORM).values( + object_xref_id=dep_object_xref_id, + query_identity=query_identity, + target_identity=target_identity, + ) + ) - mh.close() + # Get the dependent dependents just in case + master_xref_ids.append(dep.dependent_xref_id) + except Exception as e: + logging.debug(f"Could not open map file {map_file}\n Resubmit this job. Error: {e}") + return -1 self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) return total_lines diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessMoves.py b/src/python/ensembl/production/xrefs/mappers/ProcessMoves.py index c086cab01..d4085640e 100644 --- a/src/python/ensembl/production/xrefs/mappers/ProcessMoves.py +++ b/src/python/ensembl/production/xrefs/mappers/ProcessMoves.py @@ -14,8 +14,30 @@ """Mapper module for moving xref data onto appriopriate genes.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +from typing import List, Tuple, Dict +from sqlalchemy import select, func, update, delete, insert +from sqlalchemy.engine import Connection + +from ensembl.xrefs.xref_update_db_model import ( + GeneTranscriptTranslation as GeneTranscriptTranslationORM, + GeneStableId as GeneStableIdORM, + TranscriptStableId as TranscriptStableIdORM, + TranslationStableId as TranslationStableIdORM, + ObjectXref as ObjectXrefUORM, + AltAllele as AltAlleleUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + GeneDirectXref as GeneDirectXrefORM, + TranscriptDirectXref as TranscriptDirectXrefORM, + TranslationDirectXref as TranslationDirectXrefORM, + Synonym as SynonymORM, + PrimaryXref as PrimaryXrefORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class ProcessMoves(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -28,11 +50,11 @@ def biomart_testing(self, verbose: bool) -> None: xref_dbi = self.xref().connect() - again = 1 + again = True while again: - again = 0 + again = False - last_type, last_count, last_name = None, None, "DEFAULT" + last_type, last_name = None, "DEFAULT" query = ( select( @@ -48,18 +70,15 @@ def biomart_testing(self, verbose: bool) -> None: .group_by(SourceUORM.name, ObjectXrefUORM.ensembl_object_type) ) for row in 
xref_dbi.execute(query).mappings().all(): - if again: - break - if last_name == row.name: - again = 1 + again = True self.biomart_fix( row.name, last_type, row.ensembl_object_type, xref_dbi ) + break last_name = row.name last_type = row.ensembl_object_type - last_count = row.count if self.unlinked_entries(verbose, xref_dbi): raise ValueError("Problems found before source_defined_move") @@ -70,60 +89,41 @@ def biomart_testing(self, verbose: bool) -> None: def unlinked_entries(self, verbose: bool, dbi: Connection) -> bool: failed = False - xref_id, count = None, None self.update_process_status("tests_started") + def log_problems(count, description, query): + nonlocal failed + if count: + failed = True + logging.error(f"Problem with {count} {description}s") + if verbose: + for row in dbi.execute(query).mappings().all(): + logging.error(f"Problem with {description} {row.log_xref_id}") + # Get count of unlinked master xrefs count = dbi.execute( select(func.count(DependentXrefUORM.master_xref_id)) .outerjoin(XrefUORM, XrefUORM.xref_id == DependentXrefUORM.master_xref_id) .where(XrefUORM.xref_id == None) ).scalar() - - if count: - failed = True - logging.error(f"Problem with {count} master xrefs") - - if verbose: - query = ( - select(DependentXrefUORM.master_xref_id.distinct()) - .outerjoin( - XrefUORM, XrefUORM.xref_id == DependentXrefUORM.master_xref_id - ) - .where(XrefUORM.xref_id == None) - .limit(10) - ) - for row in dbi.execute(query).mappings().all(): - logging.error(f"Problem with master xref {row.master_xref_id}") + log_problems(count, "master xref", + select(DependentXrefUORM.master_xref_id.distinct().label("log_xref_id")) + .outerjoin(XrefUORM, XrefUORM.xref_id == DependentXrefUORM.master_xref_id) + .where(XrefUORM.xref_id == None) + .limit(10)) # Get count of unlinked dependent xrefs count = dbi.execute( select(func.count(DependentXrefUORM.dependent_xref_id)) - .outerjoin( - XrefUORM, XrefUORM.xref_id == DependentXrefUORM.dependent_xref_id - ) + .outerjoin(XrefUORM, XrefUORM.xref_id == DependentXrefUORM.dependent_xref_id) .where(XrefUORM.xref_id == None) ).scalar() - - if count: - failed = True - logging.error(f"Problem with {count} dependent xrefs") - - if verbose: - query = ( - select(DependentXrefUORM.dependent_xref_id.distinct()) - .outerjoin( - XrefUORM, - XrefUORM.xref_id == DependentXrefUORM.dependent_xref_id, - ) - .where(XrefUORM.xref_id == None) - .limit(10) - ) - for row in dbi.execute(query).mappings().all(): - logging.error( - f"Problem with dependent xref {row.dependent_xref_id}" - ) + log_problems(count, "dependent xref", + select(DependentXrefUORM.dependent_xref_id.distinct().label("log_xref_id")) + .outerjoin(XrefUORM, XrefUORM.xref_id == DependentXrefUORM.dependent_xref_id) + .where(XrefUORM.xref_id == None) + .limit(10)) # Get count of unlinked primary xrefs count = dbi.execute( @@ -131,59 +131,31 @@ def unlinked_entries(self, verbose: bool, dbi: Connection) -> bool: .outerjoin(XrefUORM, XrefUORM.xref_id == PrimaryXrefORM.xref_id) .where(XrefUORM.xref_id == None) ).scalar() - - if count: - failed = True - logging.error(f"Problem with {count} primary xrefs") - - if verbose: - query = ( - select(PrimaryXrefORM.xref_id.distinct()) - .outerjoin(XrefUORM, XrefUORM.xref_id == PrimaryXrefORM.xref_id) - .where(XrefUORM.xref_id == None) - .limit(10) - ) - for row in dbi.execute(query).mappings().all(): - logging.error(f"Problem with primary xref {row.xref_id}") + log_problems(count, "primary xref", + select(PrimaryXrefORM.xref_id.distinct().label("log_xref_id")) + 
.outerjoin(XrefUORM, XrefUORM.xref_id == PrimaryXrefORM.xref_id) + .where(XrefUORM.xref_id == None) + .limit(10)) db_tables = { + "transcript": {"direct": TranscriptDirectXrefORM, "stable_id": TranscriptStableIdORM}, + "translation": {"direct": TranslationDirectXrefORM, "stable_id": TranslationStableIdORM}, "gene": {"direct": GeneDirectXrefORM, "stable_id": GeneStableIdORM}, - "transcript": { - "direct": TranscriptDirectXrefORM, - "stable_id": TranscriptStableIdORM, - }, - "translation": { - "direct": TranslationDirectXrefORM, - "stable_id": TranslationStableIdORM, - }, } # Get count of unlinked direct xrefs - for object_type in ["transcript", "translation", "gene"]: - direct_table = db_tables[object_type]["direct"] + for object_type, tables in db_tables.items(): + direct_table = tables["direct"] count = dbi.execute( select(func.count(direct_table.general_xref_id)) .outerjoin(XrefUORM, XrefUORM.xref_id == direct_table.general_xref_id) .where(XrefUORM.xref_id == None) ).scalar() - - if count: - failed = True - logging.error(f"Problem with {count} {object_type} direct xrefs") - - if verbose: - query = ( - select(direct_table.general_xref_id.distinct()) - .outerjoin( - XrefUORM, XrefUORM.xref_id == direct_table.general_xref_id - ) - .where(XrefUORM.xref_id == None) - .limit(10) - ) - for row in dbi.execute(query).mappings().all(): - logging.error( - f"Problem with {object_type} direct xref {row.general_xref_id}" - ) + log_problems(count, f"{object_type} direct xref", + select(direct_table.general_xref_id.distinct().label("log_xref_id")) + .outerjoin(XrefUORM, XrefUORM.xref_id == direct_table.general_xref_id) + .where(XrefUORM.xref_id == None) + .limit(10)) # Get count of unlinked synonyms count = dbi.execute( @@ -191,280 +163,219 @@ def unlinked_entries(self, verbose: bool, dbi: Connection) -> bool: .outerjoin(XrefUORM, XrefUORM.xref_id == SynonymORM.xref_id) .where(XrefUORM.xref_id == None) ).scalar() - - if count: - failed = True - logging.error(f"Problem with {count} synonyms") - - if verbose: - query = ( - select(SynonymORM.xref_id.distinct()) - .outerjoin(XrefUORM, XrefUORM.xref_id == SynonymORM.xref_id) - .where(XrefUORM.xref_id == None) - .limit(10) - ) - for row in dbi.execute(query).mappings().all(): - logging.error(f"Problem with synonym {row.xref_id}") + log_problems(count, "synonym", + select(SynonymORM.xref_id.distinct().label("log_xref_id")) + .outerjoin(XrefUORM, XrefUORM.xref_id == SynonymORM.xref_id) + .where(XrefUORM.xref_id == None) + .limit(10)) # Get count of unlinked identity object xrefs count = dbi.execute( select(func.count(IdentityXrefUORM.object_xref_id)) - .outerjoin( - ObjectXrefUORM, - ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id, - ) + .outerjoin(ObjectXrefUORM, ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id) .where(ObjectXrefUORM.object_xref_id == None) ).scalar() - - if count: - failed = True - logging.error(f"Problem with {count} object xrefs") - - if verbose: - query = ( - select(IdentityXrefUORM.object_xref_id.distinct()) - .outerjoin( - ObjectXrefUORM, - ObjectXrefUORM.object_xref_id - == IdentityXrefUORM.object_xref_id, - ) - .where(ObjectXrefUORM.object_xref_id == None) - .limit(10) - ) - for row in dbi.execute(query).mappings().all(): - logging.error(f"Problem with object xref {row.object_xref_id}") + log_problems(count, "object xref", + select(IdentityXrefUORM.object_xref_id.distinct().label("log_xref_id")) + .outerjoin(ObjectXrefUORM, ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id) + 
.where(ObjectXrefUORM.object_xref_id == None) + .limit(10)) # Get count of unlinked objects - for object_type in ["transcript", "translation", "gene"]: + for object_type, tables in db_tables.items(): id_column = getattr(GeneTranscriptTranslationORM, f"{object_type}_id") - stable_id_table = db_tables[object_type]["stable_id"] + stable_id_table = tables["stable_id"] count = dbi.execute( select(func.count(id_column)) .outerjoin(stable_id_table, stable_id_table.internal_id == id_column) .where(stable_id_table.internal_id == None, id_column != None) ).scalar() + log_problems(count, f"{object_type}_ids", + select(id_column.label("object_id").distinct()) + .outerjoin(stable_id_table, stable_id_table.internal_id == id_column) + .where(stable_id_table.internal_id == None, id_column != None) + .limit(10)) - if count: - failed = True - logging.error(f"Problem with {count} {object_type}_ids") - - if verbose: - query = ( - select(id_column.label("object_id").distinct()) - .outerjoin( - stable_id_table, stable_id_table.internal_id == id_column - ) - .where(stable_id_table.internal_id == None, id_column != None) - .limit(10) - ) - for row in dbi.execute(query).mappings().all(): - logging.error(f"Problem with {object_type}_id {row.object_id}") - - if not failed: - self.update_process_status("tests_finished") - else: - self.update_process_status("tests_failed") + self.update_process_status("tests_finished" if not failed else "tests_failed") return failed def source_defined_move(self, verbose: bool) -> None: - xref_dbi = self.xref().connect() + logging.info("Starting source defined move") - for source in self.get_gene_specific_list(xref_dbi): - self.biomart_fix(source, "Translation", "Gene", xref_dbi) - self.biomart_fix(source, "Transcript", "Gene", xref_dbi) + with self.xref().connect() as xref_dbi: + for source in self.get_gene_specific_list(xref_dbi): + logging.info(f"Processing source: {source}") + self.biomart_fix(source, "Translation", "Gene", xref_dbi) + self.biomart_fix(source, "Transcript", "Gene", xref_dbi) - if self.unlinked_entries(verbose, xref_dbi): - raise ValueError("Problems found after source_defined_move") - - xref_dbi.close() + if self.unlinked_entries(verbose, xref_dbi): + raise ValueError("Problems found after source_defined_move") self.update_process_status("source_level_move_finished") + logging.info("Source defined move finished") def get_gene_specific_list(self, dbi: Connection) -> List[str]: sources_list = [ - "DBASS3", - "DBASS5", - "EntrezGene", - "miRBase", - "RFAM", - "TRNASCAN_SE", - "RNAMMER", - "UniGene", - "Uniprot_gn", - "WikiGene", - "MIM_GENE", - "MIM_MORBID", - "HGNC", - "MGI", - "ZFIN_ID", - "FlyBaseName_gene", - "RGD", - "SGD_GENE", - "VGNC", - "wormbase_gseqname", - "wormbase_locus", - "Xenbase", - "GeneCards", + "DBASS3", "DBASS5", "EntrezGene", "miRBase", "RFAM", "TRNASCAN_SE", + "RNAMMER", "UniGene", "Uniprot_gn", "WikiGene", "MIM_GENE", "MIM_MORBID", + "HGNC", "MGI", "ZFIN_ID", "FlyBaseName_gene", "RGD", "SGD_GENE", "VGNC", + "wormbase_gseqname", "wormbase_locus", "Xenbase", "GeneCards", ] - used_list = [] - count = None - # Check that the sources are used in the database considered - for source in sources_list: - count = dbi.execute( + used_list = [ + source for source in sources_list + if dbi.execute( select(func.count(XrefUORM.xref_id)).where( XrefUORM.source_id == SourceUORM.source_id, SourceUORM.name == source, ) - ).scalar() - - if count > 0: - used_list.append(source) + ).scalar() > 0 + ] return used_list def process_alt_alleles(self, verbose: bool) -> 
None: logging.info("Processing alt alleles") - xref_dbi = self.xref().connect() + with self.xref().connect() as xref_dbi: + alt_to_ref, ref_to_alts = self.get_alt_allele_hashes(xref_dbi) + gene_specific_list = self.get_gene_specific_list(xref_dbi) - alt_to_ref, ref_to_alts = self.get_alt_allele_hashes(xref_dbi) - gene_specific_list = self.get_gene_specific_list(xref_dbi) + move_count, del_identity_xref_count, del_object_xref_count = 0, 0, 0 - move_count, del_identity_xref_count, del_object_xref_count = 0, 0, 0 + for gene_id, ref_gene in alt_to_ref.items(): + # Move the xrefs onto the reference Gene + query = ( + update(ObjectXrefUORM) + .where( + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ObjectXrefUORM.ox_status == "DUMP_OUT", + SourceUORM.name.in_(gene_specific_list), + ) + .values(ensembl_id=ref_gene) + .prefix_with("IGNORE") + ) + row_count = xref_dbi.execute(query).rowcount + move_count += row_count - for gene_id, ref_gene in alt_to_ref.items(): - # Move the xrefs onto the reference Gene - query = ( - update(ObjectXrefUORM) - .where( + # Delete the related identity and object xrefs + query = delete(IdentityXrefUORM).where( XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id, ObjectXrefUORM.xref_id == XrefUORM.xref_id, ObjectXrefUORM.ensembl_id == gene_id, ObjectXrefUORM.ensembl_object_type == "Gene", ObjectXrefUORM.ox_status == "DUMP_OUT", SourceUORM.name.in_(gene_specific_list), ) - .values(ensembl_id=ref_gene) - .prefix_with("IGNORE") - ) - row_count = xref_dbi.execute(query).rowcount - move_count += row_count - - # Delete the related identity and object xrefs - query = delete(IdentityXrefUORM).where( - XrefUORM.source_id == SourceUORM.source_id, - ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id, - ObjectXrefUORM.xref_id == XrefUORM.xref_id, - ObjectXrefUORM.ensembl_id == gene_id, - ObjectXrefUORM.ensembl_object_type == "Gene", - ObjectXrefUORM.ox_status == "DUMP_OUT", - SourceUORM.name.in_(gene_specific_list), - ) - row_count = xref_dbi.execute(query).rowcount - del_identity_xref_count += row_count - - query = delete(ObjectXrefUORM).where( - XrefUORM.source_id == SourceUORM.source_id, - ObjectXrefUORM.xref_id == XrefUORM.xref_id, - ObjectXrefUORM.ensembl_id == gene_id, - ObjectXrefUORM.ensembl_object_type == "Gene", - ObjectXrefUORM.ox_status == "DUMP_OUT", - SourceUORM.name.in_(gene_specific_list), - ) - row_count = xref_dbi.execute(query).rowcount - del_object_xref_count += row_count - - logging.info( - f"Number of rows: moved = {move_count}, identity_xrefs deleted = {del_identity_xref_count}, object_xrefs deleted = {del_object_xref_count}" - ) - - max_object_xref_id = xref_dbi.execute( - select(func.max(ObjectXrefUORM.object_xref_id)) - ).scalar() - max_object_xref_id = int(max_object_xref_id) - - if not max_object_xref_id: - raise LookupError("Problem getting max object_xref_id") + row_count = xref_dbi.execute(query).rowcount + del_identity_xref_count += row_count - added_count, ignored = 0, 0 - - # Copy the xref data related to the reference gene onto the alt alleles - for ref_gene, alts in ref_to_alts.items(): - # Get object and identity xref data related to the reference gene - query = ( - select(ObjectXrefUORM, IdentityXrefUORM) - .outerjoin( - IdentityXrefUORM, - IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, - ) - .where( + query = 
delete(ObjectXrefUORM).where( XrefUORM.source_id == SourceUORM.source_id, ObjectXrefUORM.xref_id == XrefUORM.xref_id, - ObjectXrefUORM.ensembl_id == ref_gene, - ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_id == gene_id, ObjectXrefUORM.ensembl_object_type == "Gene", + ObjectXrefUORM.ox_status == "DUMP_OUT", SourceUORM.name.in_(gene_specific_list), ) + row_count = xref_dbi.execute(query).rowcount + del_object_xref_count += row_count + + logging.info( + f"Number of rows: moved = {move_count}, identity_xrefs deleted = {del_identity_xref_count}, object_xrefs deleted = {del_object_xref_count}" ) - for row in xref_dbi.execute(query).mappings().all(): - for alt in alts: - max_object_xref_id += 1 - - query = insert(ObjectXrefUORM).values( - object_xref_id=max_object_xref_id, - ensembl_id=alt, - ensembl_object_type=row.ensembl_object_type, - xref_id=row.xref_id, - linkage_annotation=row.linkage_annotation, - linkage_type=row.linkage_type, - ox_status=row.ox_status, - unused_priority=row.unused_priority, - master_xref_id=row.master_xref_id, - ) - row_count = xref_dbi.execute(query).rowcount - # Only add identity xref if object_xref was added successfully - if row_count: - added_count += 1 + max_object_xref_id = xref_dbi.execute( + select(func.max(ObjectXrefUORM.object_xref_id)) + ).scalar() + max_object_xref_id = int(max_object_xref_id) - query = insert(IdentityXrefUORM).values( - object_xref_id=max_object_xref_id, - query_identity=row.query_identity, - target_identity=row.target_identity, - hit_start=row.hit_start, - hit_end=row.hit_end, - translation_start=row.translation_start, - translation_end=row.translation_end, - cigar_line=row.cigar_line, - score=row.score, - evalue=row.evalue, - ) - xref_dbi.execute(query) - else: - ignored += 1 + if not max_object_xref_id: + raise LookupError("Problem getting max object_xref_id") - logging.info(f"Added {added_count} new mappings and ignored {ignored}") + added_count, ignored = 0, 0 - if self.unlinked_entries(verbose, xref_dbi): - raise ValueError("Problems found after process_alt_alleles") + # Copy the xref data related to the reference gene onto the alt alleles + for ref_gene, alts in ref_to_alts.items(): + # Get object and identity xref data related to the reference gene + query = ( + select(ObjectXrefUORM, IdentityXrefUORM) + .outerjoin( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + .where( + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.ensembl_id == ref_gene, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_object_type == "Gene", + SourceUORM.name.in_(gene_specific_list), + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + for alt in alts: + max_object_xref_id += 1 - xref_dbi.close() + query = insert(ObjectXrefUORM).values( + object_xref_id=max_object_xref_id, + ensembl_id=alt, + ensembl_object_type=row.ensembl_object_type, + xref_id=row.xref_id, + linkage_annotation=row.linkage_annotation, + linkage_type=row.linkage_type, + ox_status=row.ox_status, + unused_priority=row.unused_priority, + master_xref_id=row.master_xref_id, + ) + row_count = xref_dbi.execute(query).rowcount + + # Only add identity xref if object_xref was added successfully + if row_count: + added_count += 1 + + query = insert(IdentityXrefUORM).values( + object_xref_id=max_object_xref_id, + query_identity=row.query_identity, + target_identity=row.target_identity, + hit_start=row.hit_start, + hit_end=row.hit_end, + 
translation_start=row.translation_start, + translation_end=row.translation_end, + cigar_line=row.cigar_line, + score=row.score, + evalue=row.evalue, + ) + xref_dbi.execute(query) + else: + ignored += 1 + + logging.info(f"Added {added_count} new mappings and ignored {ignored}") + + if self.unlinked_entries(verbose, xref_dbi): + raise ValueError("Problems found after process_alt_alleles") self.update_process_status("alt_alleles_processed") def get_alt_allele_hashes(self, dbi: Connection) -> Tuple[Dict[int, int], Dict[int, List[int]]]: - alt_to_ref, ref_to_alts = {}, {} - last_alt_allele, ref_gene = 0, None + alt_to_ref = {} + ref_to_alts = {} + last_alt_allele = None + ref_gene = None query = select( AltAlleleUORM.alt_allele_id, AltAlleleUORM.gene_id, AltAlleleUORM.is_reference, ).order_by(AltAlleleUORM.alt_allele_id, AltAlleleUORM.is_reference.desc()) + for row in dbi.execute(query).mappings().all(): if row.alt_allele_id != last_alt_allele: # Use the first non-reference gene if there is no reference gene in an alt_allele diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessPaired.py b/src/python/ensembl/production/xrefs/mappers/ProcessPaired.py index 0dcbfdff4..3f30fcb7b 100644 --- a/src/python/ensembl/production/xrefs/mappers/ProcessPaired.py +++ b/src/python/ensembl/production/xrefs/mappers/ProcessPaired.py @@ -14,8 +14,22 @@ """Mapper module for processing paired xrefs.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +from sqlalchemy import select, update, insert +from sqlalchemy.orm import aliased +from sqlalchemy.engine import Connection + +from ensembl.xrefs.xref_update_db_model import ( + GeneTranscriptTranslation as GeneTranscriptTranslationORM, + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + Pairs as PairsORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class ProcessPaired(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -28,7 +42,6 @@ def process(self) -> None: xref_dbi = self.xref().connect() - object_xref_id = None change = { "translation object xrefs added": 0, "translation object xrefs removed": 0, @@ -67,6 +80,7 @@ def process(self) -> None: ) for row in xref_dbi.execute(query).mappings().all(): # Check if translation is linked to the paired RefSeq peptide + transl_object_xref_id = None if row.translation_id: query = ( select(ObjectXrefUORM.object_xref_id, ObjectXrefUORM.xref_id) @@ -83,8 +97,6 @@ def process(self) -> None: if result.rowcount > 0: object_xref_row = result.mappings().all()[0] transl_object_xref_id = object_xref_row.object_xref_id - else: - transl_object_xref_id = None # If it's already linked we don't have to do anything if not transl_object_xref_id: @@ -167,10 +179,7 @@ def process(self) -> None: ) for row in xref_dbi.execute(query).mappings().all(): if RefSeq_pep_translation.get(row.accession): - found = 0 - for tr_id in RefSeq_pep_translation[row.accession]: - if tr_id == row.ensembl_id: - found = 1 + found = any(tr_id == row.ensembl_id for tr_id in RefSeq_pep_translation[row.accession]) if not found: # This translations's transcript is not matched with the paired RefSeq_mRNA%, @@ -194,23 +203,18 @@ def process(self) -> None: self.update_process_status("processed_pairs") def process_dependents(self, translation_object_xref_id: int, translation_id: int, transcript_id: int, dbi: Connection) -> None: - master_object_xrefs = [] - new_master_object_xref_id = None - 
master_object_xref_ids = {} - - master_object_xrefs.append(translation_object_xref_id) - master_object_xref_ids[translation_object_xref_id] = 1 + master_object_xrefs = [translation_object_xref_id] + master_object_xref_ids = set(master_object_xrefs) while master_object_xrefs: master_object_xref_id = master_object_xrefs.pop() - dependent_object_xref_id = None MasterObjectXref = aliased(ObjectXrefUORM) DependentObjectXref = aliased(ObjectXrefUORM) - MasterXref = aliased(XrefUORM) DependentXref = aliased(XrefUORM) + # Process dependent xrefs for Translation query = select(DependentObjectXref.object_xref_id.distinct()).where( DependentXref.xref_id == DependentXrefUORM.dependent_xref_id, MasterXref.xref_id == DependentXrefUORM.master_xref_id, @@ -225,10 +229,11 @@ def process_dependents(self, translation_object_xref_id: int, translation_id: in for row in dbi.execute(query).mappings().all(): self.update_object_xref_status(row.object_xref_id, "MULTI_DELETE", dbi) - if not master_object_xref_ids.get(row.object_xref_id): - master_object_xref_ids[row.object_xref_id] = 1 + if row.object_xref_id not in master_object_xref_ids: + master_object_xref_ids.add(row.object_xref_id) master_object_xrefs.append(row.object_xref_id) + # Process dependent xrefs for Transcript query = select(DependentObjectXref.object_xref_id.distinct()).where( DependentXref.xref_id == DependentXrefUORM.dependent_xref_id, MasterXref.xref_id == DependentXrefUORM.master_xref_id, @@ -243,6 +248,6 @@ def process_dependents(self, translation_object_xref_id: int, translation_id: in for row in dbi.execute(query).mappings().all(): self.update_object_xref_status(row.object_xref_id, "MULTI_DELETE", dbi) - if not master_object_xref_ids.get(row.object_xref_id): - master_object_xref_ids[row.object_xref_id] = 1 + if row.object_xref_id not in master_object_xref_ids: + master_object_xref_ids.add(row.object_xref_id) master_object_xrefs.append(row.object_xref_id) diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py b/src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py index ba212ddf6..8f418d271 100644 --- a/src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py +++ b/src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py @@ -14,8 +14,21 @@ """Mapper module for processing xref priorities.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +from typing import List +from sqlalchemy import select, update, insert, delete, desc +from sqlalchemy.engine import Connection + +from ensembl.xrefs.xref_update_db_model import ( + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + Synonym as SynonymORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class ProcessPriorities(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -51,7 +64,7 @@ def process(self) -> None: # Now ALL object_xrefs have an identity_xref # So we can do a straight join and treat all info_types the same way for name in names: - last_acc, last_name, best_xref_id, last_xref_id, seen = "", "", None, 0, 0 + last_acc, last_name, best_xref_id, last_xref_id, seen = "", "", None, 0, False best_ensembl_id, gone = [], [] query = ( @@ -147,9 +160,7 @@ def process(self) -> None: ) # Copy synonyms across if they are missing - query = select(SynonymORM.synonym).where( - SynonymORM.xref_id == row.xref_id - ) + query = select(SynonymORM.synonym).where(SynonymORM.xref_id == row.xref_id) 
for synonym_row in ( xref_dbi.execute(query).mappings().all() ): @@ -179,12 +190,11 @@ def process(self) -> None: best_ensembl_id.append(row.ensembl_id) # Best priority failed so another one now found so set dumped - if len(gone) > 0: - if last_name == row.accession: - for x_id in gone: - self.update_xref_dumped( - x_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi - ) + if gone and last_name == row.accession: + for x_id in gone: + self.update_xref_dumped( + x_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi + ) else: # New xref_id if row.ox_status == "DUMP_OUT": @@ -192,7 +202,7 @@ def process(self) -> None: best_xref_id = row.xref_id best_ensembl_id = [row.ensembl_id] - if len(gone) > 0 and last_name == row.accession: + if gone and last_name == row.accession: for x_id in gone: self.update_xref_dumped( x_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi @@ -226,7 +236,7 @@ def get_priority_names(self, dbi: Connection) -> List[str]: for row in dbi.execute(query).mappings().all(): if row.name == last_name and not seen.get(row.name): names.append(row.name) - seen[row.name] = 1 + seen[row.name] = True last_name = row.name return names @@ -238,7 +248,7 @@ def update_xref_dumped(self, xref_id: int, dumped: str, dbi: Connection) -> None def process_dependents(self, old_master_xref_id: int, new_master_xref_id: int, dbi: Connection) -> None: master_xrefs = [old_master_xref_id] - recursive = 0 + recursive = False # Create a hash of all possible mappings for this accession ensembl_ids = {} @@ -296,7 +306,7 @@ def process_dependents(self, old_master_xref_id: int, new_master_xref_id: int, d for row in dbi.execute(query).mappings().all(): # Remove all mappings to low priority xrefs # Then delete any leftover identity xrefs of it - for ensembl_id in old_ensembl_ids.get(row.ensembl_object_type): + for ensembl_id in old_ensembl_ids.get(row.ensembl_object_type, []): self._detach_object_xref( xref_id, row.dependent_xref_id, @@ -319,7 +329,7 @@ def process_dependents(self, old_master_xref_id: int, new_master_xref_id: int, d ) # Loop through all chosen (best) ensembl ids mapped to priority xref, and connect them with object_xrefs - for ensembl_id in ensembl_ids.get(row.ensembl_object_type): + for ensembl_id in ensembl_ids.get(row.ensembl_object_type, []): # Add new object_xref for each best_ensembl_id dbi.execute( insert(ObjectXrefUORM) @@ -357,7 +367,7 @@ def process_dependents(self, old_master_xref_id: int, new_master_xref_id: int, d if row.dependent_xref_id != xref_id: master_xrefs.append(row.dependent_xref_id) - recursive = 1 + recursive = True def _detach_object_xref(self, xref_id: int, dependent_xref_id: int, object_type: str, ensembl_id: int, dbi: Connection) -> None: # Drop all the identity and go xrefs for the dependents of an xref diff --git a/src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py b/src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py index 473af5a69..6d5f682ae 100644 --- a/src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py @@ -14,8 +14,7 @@ """Mapper module for processing RNACentral xref data.""" -from ensembl.production.xrefs.mappers.ChecksumMapper import * - +from ensembl.production.xrefs.mappers.ChecksumMapper import ChecksumMapper class RNACentralMapper(ChecksumMapper): def target(self) -> str: diff --git a/src/python/ensembl/production/xrefs/mappers/TestMappings.py b/src/python/ensembl/production/xrefs/mappers/TestMappings.py index 4511741d1..d3d0a319a 100644 --- 
a/src/python/ensembl/production/xrefs/mappers/TestMappings.py +++ b/src/python/ensembl/production/xrefs/mappers/TestMappings.py @@ -14,8 +14,30 @@ """Mapper module for running validity checks on xref data.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +from sqlalchemy import select, func, text + +from ensembl.core.models import ( + Gene as GeneORM, + ObjectXref as ObjectXrefCORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM +) + +from ensembl.xrefs.xref_update_db_model import ( + GeneStableId as GeneStableIdORM, + TranscriptStableId as TranscriptStableIdORM, + TranslationStableId as TranslationStableIdORM, + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM, + GeneDirectXref as GeneDirectXrefORM, + TranscriptDirectXref as TranscriptDirectXrefORM, + TranslationDirectXref as TranslationDirectXrefORM, + Synonym as SynonymORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class TestMappings(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -40,10 +62,10 @@ def direct_stable_id_check(self) -> int: total_warnings_count = 0 - for object_type in ["gene", "transcript", "translation"]: + for object_type, tables in db_tables.items(): warnings_count = 0 - direct_table = db_tables[object_type]["direct"] - stable_id_table = db_tables[object_type]["stable_id"] + direct_table = tables["direct"] + stable_id_table = tables["stable_id"] query = ( select(SourceUORM.name, func.count(XrefUORM.xref_id).label("count")) @@ -57,7 +79,7 @@ def direct_stable_id_check(self) -> int: .group_by(SourceUORM.name) ) for row in xref_dbi.execute(query).mappings().all(): - logging.warn( + logging.warning( f"{row.name} has {row.count} invalid stable IDs in {object_type}_direct_xref" ) warnings_count += 1 @@ -78,8 +100,15 @@ def xrefs_counts_check(self) -> int: core_count, xref_count = {}, {} # TO DO: sqlalchemy syntax -- can't figure out how to count 2 columns - xref_query = f'SELECT s.name, COUNT(DISTINCT x.xref_id, ox.ensembl_id) AS count FROM xref x, object_xref ox, source s WHERE ox.xref_id = x.xref_id AND x.source_id = s.source_id AND ox_status = "DUMP_OUT" GROUP BY s.name' - for row in xref_dbi.execute(text(xref_query)).mappings().all(): + xref_query = text( + 'SELECT s.name, COUNT(DISTINCT x.xref_id, ox.ensembl_id) AS count ' + 'FROM xref x ' + 'JOIN object_xref ox ON ox.xref_id = x.xref_id ' + 'JOIN source s ON x.source_id = s.source_id ' + 'WHERE ox_status = "DUMP_OUT" ' + 'GROUP BY s.name' + ) + for row in xref_dbi.execute(xref_query).mappings().all(): xref_count[row.name] = row.count query = ( @@ -87,10 +116,8 @@ def xrefs_counts_check(self) -> int: ExternalDbORM.db_name, func.count(ObjectXrefCORM.object_xref_id).label("count"), ) - .where( - XrefCORM.xref_id == ObjectXrefCORM.xref_id, - XrefCORM.external_db_id == ExternalDbORM.external_db_id, - ) + .join(XrefCORM, XrefCORM.xref_id == ObjectXrefCORM.xref_id) + .join(ExternalDbORM, XrefCORM.external_db_id == ExternalDbORM.external_db_id) .filter((XrefCORM.info_type == None) | (XrefCORM.info_type != "PROJECTION")) .group_by(ExternalDbORM.db_name) ) @@ -102,24 +129,24 @@ def xrefs_counts_check(self) -> int: change = ((xref_count[row.db_name] - row.count) / row.count) * 100 if change > 5: - logging.warn( - f"{row.db_name} has increased by {change}%. It was {row.count} in the core DB, while it is {xref_count[row.db_name]} in the xref DB" + logging.warning( + f"{row.db_name} has increased by {change:.2f}%. 
It was {row.count} in the core DB, while it is {xref_count[row.db_name]} in the xref DB" ) warnings_count += 1 elif change < -5: - logging.warn( - f"{row.db_name} has decreased by {change}%. It was {row.count} in the core DB, while it is {xref_count[row.db_name]} in the xref DB" + logging.warning( + f"{row.db_name} has decreased by {change:.2f}%. It was {row.count} in the core DB, while it is {xref_count[row.db_name]} in the xref DB" ) warnings_count += 1 else: - logging.warn( + logging.warning( f"{row.db_name} xrefs are not in the xref DB but {row.count} are in the core DB" ) warnings_count += 1 for name, count in xref_count.items(): if not core_count.get(name): - logging.warn( + logging.warning( f"{name} has {count} xrefs in the xref DB but none in the core DB" ) warnings_count += 1 @@ -141,53 +168,52 @@ def name_change_check(self, official_name: str = None) -> int: xref_dbi = self.xref().connect() core_dbi = self.core().connect() - query = select( - XrefUORM.label, GeneStableIdORM.internal_id, GeneStableIdORM.stable_id - ).where( - XrefUORM.xref_id == ObjectXrefUORM.object_xref_id, - ObjectXrefUORM.ensembl_object_type == "Gene", - GeneStableIdORM.internal_id == ObjectXrefUORM.ensembl_id, - XrefUORM.source_id == SourceUORM.source_id, - SourceUORM.name.like(f"{official_name}_%"), + # Query to get new names and stable IDs + query = ( + select(XrefUORM.label, GeneStableIdORM.internal_id, GeneStableIdORM.stable_id) + .join(ObjectXrefUORM, XrefUORM.xref_id == ObjectXrefUORM.object_xref_id) + .join(GeneStableIdORM, GeneStableIdORM.internal_id == ObjectXrefUORM.ensembl_id) + .join(SourceUORM, XrefUORM.source_id == SourceUORM.source_id) + .where( + ObjectXrefUORM.ensembl_object_type == "Gene", + SourceUORM.name.like(f"{official_name}_%") + ) ) for row in xref_dbi.execute(query).mappings().all(): new_name[row.internal_id] = row.label id_to_stable_id[row.internal_id] = row.stable_id + # Query to get aliases query = ( select(XrefUORM.label, SynonymORM.synonym) + .join(SynonymORM, XrefUORM.xref_id == SynonymORM.xref_id) + .join(SourceUORM, XrefUORM.source_id == SourceUORM.source_id) .where( - XrefUORM.xref_id == SynonymORM.xref_id, - XrefUORM.source_id == SourceUORM.source_id, - ) - .filter( - (SourceUORM.name.like(f"{official_name}_%")) - | (SourceUORM.name.like("EntrezGene")) + (SourceUORM.name.like(f"{official_name}_%")) | (SourceUORM.name.like("EntrezGene")) ) ) for row in xref_dbi.execute(query).mappings().all(): alias[row.synonym] = row.label - query = select(XrefCORM.display_label, GeneORM.gene_id).where( - XrefCORM.xref_id == GeneORM.display_xref_id, - GeneORM.biotype == "protein_coding", + # Query to get current display labels + query = ( + select(XrefCORM.display_label, GeneORM.gene_id) + .join(GeneORM, XrefCORM.xref_id == GeneORM.display_xref_id) + .where(GeneORM.biotype == "protein_coding") ) for row in core_dbi.execute(query).mappings().all(): if new_name.get(row.gene_id): total_count += 1 if new_name.get(row.gene_id) and new_name[row.gene_id] != row.display_label: - if ( - not alias.get(row.display_label) - or alias.get(row.display_label) != new_name[row.gene_id] - ): - logging.warn( + if not alias.get(row.display_label) or alias.get(row.display_label) != new_name[row.gene_id]: + logging.warning( f"gene ID ({row.gene_id}) {id_to_stable_id[row.gene_id]} new = {new_name[row.gene_id]} old = {row.display_label}" ) warnings_count += 1 if total_count: - logging.warn( + logging.warning( f"{warnings_count} entries with different names out of {total_count} protein coding gene comparisons" ) 
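Note (illustrative, not part of the patch): the XrefLoader.update() changes further down rely on an ID-offset scheme when copying xrefs from the intermediate xref DB into the core DB -- xref_offset and object_xref_offset are set to the current maximum xref_id / object_xref_id in core (or 0 for an empty table), and every loaded row uses intermediate_id + offset so new rows never collide with existing core rows. A minimal, hypothetical Python sketch of that arithmetic; the function and variable names below are illustrative and are not part of the module's API:

# Illustrative only -- mirrors the offset arithmetic applied in XrefLoader.update()
# (offset = max existing core ID or 0; loaded rows use intermediate_id + offset).
from typing import Dict, List

def shift_ids(intermediate_ids: List[int], max_core_id: int) -> Dict[int, int]:
    """Map intermediate DB IDs onto core DB IDs using an offset."""
    offset = max_core_id or 0  # empty core table -> offset of 0
    return {i: i + offset for i in intermediate_ids}

# Example: core DB already holds xref_ids up to 500000, intermediate DB holds 1..3
# shift_ids([1, 2, 3], 500000) -> {1: 500001, 2: 500002, 3: 500003}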
diff --git a/src/python/ensembl/production/xrefs/mappers/UniParcMapper.py b/src/python/ensembl/production/xrefs/mappers/UniParcMapper.py index f518303bb..2b8a97b29 100644 --- a/src/python/ensembl/production/xrefs/mappers/UniParcMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/UniParcMapper.py @@ -14,8 +14,7 @@ """Mapper module for processing UniParc xref data.""" -from ensembl.production.xrefs.mappers.ChecksumMapper import * - +from ensembl.production.xrefs.mappers.ChecksumMapper import ChecksumMapper class UniParcMapper(ChecksumMapper): def target(self) -> str: diff --git a/src/python/ensembl/production/xrefs/mappers/XrefLoader.py b/src/python/ensembl/production/xrefs/mappers/XrefLoader.py index be634d870..c95ee7716 100644 --- a/src/python/ensembl/production/xrefs/mappers/XrefLoader.py +++ b/src/python/ensembl/production/xrefs/mappers/XrefLoader.py @@ -14,8 +14,40 @@ """Mapper module for loading xref data into the core DB.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +import re +from datetime import datetime +from sqlalchemy import select, func, update, delete +from sqlalchemy.dialects.mysql import insert +from sqlalchemy.orm import sessionmaker, aliased, Session +from sqlalchemy.engine import Connection +from sqlalchemy.exc import SQLAlchemyError +from typing import Any, Dict + +from ensembl.core.models import ( + Gene as GeneORM, + ObjectXref as ObjectXrefCORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + UnmappedObject as UnmappedObjectORM, + Analysis as AnalysisORM, + OntologyXref as OntologyXrefORM, + ExternalSynonym as ExternalSynonymORM, + DependentXref as DependentXrefCORM, + IdentityXref as IdentityXrefCORM +) + +from ensembl.xrefs.xref_update_db_model import ( + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + Synonym as SynonymORM, + PrimaryXref as PrimaryXrefORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class XrefLoader(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -33,7 +65,8 @@ def update(self, species_name: str) -> None: self.delete_projection_data(core_dbi) # Get the source IDs of relevant external DBs - name_to_external_db_id, source_id_to_external_db_id = {}, {} + name_to_external_db_id = {} + source_id_to_external_db_id = {} query = select(ExternalDbORM.external_db_id, ExternalDbORM.db_name) for row in core_dbi.execute(query).mappings().all(): @@ -46,15 +79,11 @@ def update(self, species_name: str) -> None: ) for row in xref_dbi.execute(query).mappings().all(): if name_to_external_db_id.get(row.name): - source_id_to_external_db_id[row.source_id] = name_to_external_db_id[ - row.name - ] + source_id_to_external_db_id[row.source_id] = name_to_external_db_id[row.name] elif re.search(r"notransfer$", row.name): continue else: - raise LookupError( - f"Could not find {row.name} in external_db table in the core DB" - ) + raise LookupError(f"Could not find {row.name} in external_db table in the core DB") # Reset dumped field in case module is running again xref_dbi.execute( @@ -64,29 +93,16 @@ def update(self, species_name: str) -> None: ) # Delete existing xrefs in core DB (only from relevant sources) - self.deleted_existing_xrefs(name_to_external_db_id, xref_dbi, core_dbi) + self.deleted_existing_xrefs(name_to_external_db_id, xref_dbi) # Get the offsets for xref and object_xref tables - # This is used to track the xrefs whe mapping onto the core DB - xref_offset = 
core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() - object_xref_offset = core_dbi.execute( - select(func.max(ObjectXrefCORM.object_xref_id)) - ).scalar() + xref_offset = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() or 0 + object_xref_offset = core_dbi.execute(select(func.max(ObjectXrefCORM.object_xref_id))).scalar() or 0 - if not xref_offset: - xref_offset = 0 - else: - xref_offset = int(xref_offset) self.add_meta_pair("xref_offset", xref_offset) - if not object_xref_offset: - object_xref_offset = 0 - else: - object_xref_offset = int(object_xref_offset) self.add_meta_pair("object_xref_offset", object_xref_offset) - logging.info( - f"DB offsets: xref={xref_offset}, object_xref={object_xref_offset}" - ) + logging.info(f"DB offsets: xref={xref_offset}, object_xref={object_xref_offset}") # Get analysis IDs analysis_ids = self.get_analysis(core_dbi) @@ -110,8 +126,6 @@ def update(self, species_name: str) -> None: .order_by(XrefUORM.xref_id) ) - #### TO DO: transaction - # Get source info from xref DB query = ( select( @@ -133,9 +147,7 @@ def update(self, species_name: str) -> None: # We only care about specific sources if not name_to_external_db_id.get(source_row.name): continue - logging.info( - f"Updating source '{source_row.name}' ({source_row.source_id}) in core" - ) + logging.info(f"Updating source '{source_row.name}' ({source_row.source_id}) in core") where_from = source_row.priority_description if where_from: @@ -144,319 +156,294 @@ def update(self, species_name: str) -> None: external_id = name_to_external_db_id[source_row.name] xref_list = [] - if ( - source_row.info_type == "DIRECT" - or source_row.info_type == "INFERRED_PAIR" - or source_row.info_type == "MISC" - ): - count, last_xref_id = 0, 0 - - # Get all direct, inferred pair and misc xrefs from intermediate DB - query = xref_object_identity_query.where( - XrefUORM.source_id == source_row.source_id, - XrefUORM.info_type == source_row.info_type, - ) - for xref_row in xref_dbi.execute(query).mappings().all(): - xref_id = int(xref_row.xref_id) - object_xref_id = int(xref_row.object_xref_id) - - if last_xref_id != xref_id: - xref_list.append(xref_id) - count += 1 - - # Add xref into core DB - info_text = xref_row.info_text - if not info_text: - info_text = where_from - xref_args = { - "xref_id": xref_id, - "accession": xref_row.accession, - "external_db_id": external_id, - "label": xref_row.label, - "description": xref_row.description, - "version": xref_row.version, - "info_type": xref_row.info_type, - "info_text": info_text, - } - xref_id = self.add_xref(xref_offset, xref_args, core_dbi) - last_xref_id = xref_id - - # Add object xref into core DB - object_xref_args = { - "object_xref_id": object_xref_id, - "ensembl_id": xref_row.ensembl_id, - "ensembl_type": xref_row.ensembl_object_type, - "xref_id": xref_id + xref_offset, - "analysis_id": analysis_ids[xref_row.ensembl_object_type], - } - object_xref_id = self.add_object_xref( - object_xref_offset, object_xref_args, core_dbi - ) + Session = sessionmaker(bind=self.core().execution_options(isolation_level="READ COMMITTED")) + with Session.begin() as session: + try: + if source_row.info_type in ["DIRECT", "INFERRED_PAIR", "MISC"]: + count, last_xref_id = 0, 0 + + # Get all direct, inferred pair and misc xrefs from intermediate DB + query = xref_object_identity_query.where( + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == source_row.info_type, + ) + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = 
int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + info_text = xref_row.info_text or where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": xref_row.label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, session) + last_xref_id = xref_id + + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + "analysis_id": analysis_ids[xref_row.ensembl_object_type], + } + object_xref_id = self.add_object_xref(object_xref_offset, object_xref_args, session) + + # Add identity xref into core DB + if xref_row.translation_start: + query = ( + insert(IdentityXrefCORM) + .values( + object_xref_id=object_xref_id + object_xref_offset, + xref_identity=xref_row.query_identity, + ensembl_identity=xref_row.target_identity, + xref_start=xref_row.hit_start, + xref_end=xref_row.hit_end, + ensembl_start=xref_row.translation_start, + ensembl_end=xref_row.translation_end, + cigar_line=xref_row.cigar_line, + score=xref_row.score, + evalue=xref_row.evalue, + ) + .prefix_with("IGNORE") + ) + session.execute(query) - # Add identity xref into core DB - if xref_row.translation_start: + logging.info(f"\tLoaded {count} {source_row.info_type} xrefs for '{species_name}'") + elif source_row.info_type == "CHECKSUM": + count, last_xref_id = 0, 0 + + # Get all checksum xrefs from intermediate DB + query = xref_object_query.where( + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == source_row.info_type, + ) + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + info_text = xref_row.info_text or where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": xref_row.label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, session) + last_xref_id = xref_id + + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + "analysis_id": analysis_ids["checksum"], + } + object_xref_id = self.add_object_xref(object_xref_offset, object_xref_args, session) + + logging.info(f"\tLoaded {count} CHECKSUM xrefs for '{species_name}'") + elif source_row.info_type == "DEPENDENT": + count, last_xref_id, last_ensembl_id, master_error_count = 0, 0, 0, 0 + master_problems = [] + + # Get all dependent xrefs from intermediate DB + MasterXref = aliased(XrefUORM) query = ( - insert(IdentityXrefCORM) - .values( - object_xref_id=object_xref_id + object_xref_offset, - xref_identity=xref_row.query_identity, - ensembl_identity=xref_row.target_identity, - xref_start=xref_row.hit_start, - xref_end=xref_row.hit_end, - ensembl_start=xref_row.translation_start, - ensembl_end=xref_row.translation_end, - cigar_line=xref_row.cigar_line, - 
score=xref_row.score, - evalue=xref_row.evalue, + select(XrefUORM, ObjectXrefUORM) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.master_xref_id == MasterXref.xref_id, + MasterXref.source_id == SourceUORM.source_id, + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == "DEPENDENT", ) - .prefix_with("IGNORE") + .order_by(XrefUORM.xref_id, ObjectXrefUORM.ensembl_id, SourceUORM.ordered) ) - core_dbi.execute(query) - - logging.info( - f"\tLoaded {count} {source_row.info_type} xrefs for '{species_name}'" - ) - elif source_row.info_type == "CHECKSUM": - count, last_xref_id = 0, 0 + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + label = xref_row.label or xref_row.accession + info_text = xref_row.info_text or where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, session) + + if last_xref_id != xref_id or last_ensembl_id != xref_row.ensembl_id: + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + "analysis_id": analysis_ids[xref_row.ensembl_object_type], + } + object_xref_id = self.add_object_xref(object_xref_offset, object_xref_args, session) + + if xref_row.master_xref_id: + # Add dependent xref into core DB + session.execute( + insert(DependentXrefCORM) + .values( + object_xref_id=object_xref_id + object_xref_offset, + master_xref_id=xref_row.master_xref_id + xref_offset, + dependent_xref_id=xref_id + xref_offset, + ) + .prefix_with("IGNORE") + ) + else: + if master_error_count < 10: + master_problems.append(xref_row.accession) + master_error_count += 1 + + last_xref_id = xref_id + last_ensembl_id = xref_row.ensembl_id + + if master_problems: + logging.warning( + f"For {source_row.name}, there were {master_error_count} problem master xrefs. 
Examples are: " + + ", ".join(master_problems) + ) - # Get all checksum xrefs from intermediate DB - query = xref_object_query.where( - XrefUORM.source_id == source_row.source_id, - XrefUORM.info_type == source_row.info_type, - ) - for xref_row in xref_dbi.execute(query).mappings().all(): - xref_id = int(xref_row.xref_id) - object_xref_id = int(xref_row.object_xref_id) - - if last_xref_id != xref_id: - xref_list.append(xref_id) - count += 1 - - # Add xref into core DB - info_text = xref_row.info_text - if not info_text: - info_text = where_from - xref_args = { - "xref_id": xref_id, - "accession": xref_row.accession, - "external_db_id": external_id, - "label": xref_row.label, - "description": xref_row.description, - "version": xref_row.version, - "info_type": xref_row.info_type, - "info_text": info_text, - } - xref_id = self.add_xref(xref_offset, xref_args, core_dbi) - last_xref_id = xref_id - - # Add object xref into core DB - object_xref_args = { - "object_xref_id": object_xref_id, - "ensembl_id": xref_row.ensembl_id, - "ensembl_type": xref_row.ensembl_object_type, - "xref_id": xref_id + xref_offset, - "analysis_id": analysis_ids["checksum"], - } - object_xref_id = self.add_object_xref( - object_xref_offset, object_xref_args, core_dbi - ) + logging.info(f"\tLoaded {count} DEPENDENT xrefs for '{species_name}'") + elif source_row.info_type == "SEQUENCE_MATCH": + count, last_xref_id = 0, 0 - logging.info(f"\tLoaded {count} CHECKSUM xrefs for '{species_name}'") - elif source_row.info_type == "DEPENDENT": - count, last_xref_id, last_ensembl_id, master_error_count = 0, 0, 0, 0 - master_problems = [] - - # Get all dependent xrefs from intermediate DB - MasterXref = aliased(XrefUORM) - query = ( - select(XrefUORM, ObjectXrefUORM) - .where( - ObjectXrefUORM.ox_status == "DUMP_OUT", - ObjectXrefUORM.xref_id == XrefUORM.xref_id, - ObjectXrefUORM.master_xref_id == MasterXref.xref_id, - MasterXref.source_id == SourceUORM.source_id, - XrefUORM.source_id == source_row.source_id, - XrefUORM.info_type == "DEPENDENT", - ) - .order_by( - XrefUORM.xref_id, ObjectXrefUORM.ensembl_id, SourceUORM.ordered - ) - ) - for xref_row in xref_dbi.execute(query).mappings().all(): - xref_id = int(xref_row.xref_id) - object_xref_id = int(xref_row.object_xref_id) - - if last_xref_id != xref_id: - xref_list.append(xref_id) - count += 1 - - # Add xref into core DB - label = xref_row.label - if not label: - label = xref_row.accession - info_text = xref_row.info_text - if not info_text: - info_text = where_from - xref_args = { - "xref_id": xref_id, - "accession": xref_row.accession, - "external_db_id": external_id, - "label": label, - "description": xref_row.description, - "version": xref_row.version, - "info_type": xref_row.info_type, - "info_text": info_text, - } - xref_id = self.add_xref(xref_offset, xref_args, core_dbi) - - if ( - last_xref_id != xref_id - or last_ensembl_id != xref_row.ensembl_id - ): - # Add object xref into core DB - object_xref_args = { - "object_xref_id": object_xref_id, - "ensembl_id": xref_row.ensembl_id, - "ensembl_type": xref_row.ensembl_object_type, - "xref_id": xref_id + xref_offset, - "analysis_id": analysis_ids[xref_row.ensembl_object_type], - } - object_xref_id = self.add_object_xref( - object_xref_offset, object_xref_args, core_dbi + # Get all direct, inferred pair and misc xrefs from intermediate DB + query = xref_object_identity_query.where( + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == source_row.info_type, ) - - if xref_row.master_xref_id: - # Add dependent xref 
into core DB - core_dbi.execute( - insert(DependentXrefCORM) + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + info_text = xref_row.info_text or where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": xref_row.label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, session) + last_xref_id = xref_id + + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + "analysis_id": analysis_ids[xref_row.ensembl_object_type], + } + object_xref_id = self.add_object_xref(object_xref_offset, object_xref_args, session) + + # Add identity xref into core DB + query = ( + insert(IdentityXrefCORM) .values( object_xref_id=object_xref_id + object_xref_offset, - master_xref_id=xref_row.master_xref_id - + xref_offset, - dependent_xref_id=xref_id + xref_offset, + xref_identity=xref_row.query_identity, + ensembl_identity=xref_row.target_identity, + xref_start=xref_row.hit_start, + xref_end=xref_row.hit_end, + ensembl_start=xref_row.translation_start, + ensembl_end=xref_row.translation_end, + cigar_line=xref_row.cigar_line, + score=xref_row.score, + evalue=xref_row.evalue, ) .prefix_with("IGNORE") ) - else: - if master_error_count < 10: - master_problems.append(xref_row.accession) + session.execute(query) - master_error_count += 1 + logging.info(f"\tLoaded {count} SEQUENCE_MATCH xrefs for '{species_name}'") + else: + logging.debug(f"\tPROBLEM: what type is {source_row.info_type}") - last_xref_id = xref_id - last_ensembl_id = xref_row.ensembl_id - - if len(master_problems) > 0: - logging.warn( - f"For {source_row.name}, there were {master_error_count} problem master xrefs. 
Examples are: " - + ", ".join(master_problems) - ) + # Transfer synonym data + if xref_list: + syn_count = 0 - logging.info(f"\tLoaded {count} DEPENDENT xrefs for '{species_name}'") - elif source_row.info_type == "SEQUENCE_MATCH": - count, last_xref_id = 0, 0 - - # Get all direct, inferred pair and misc xrefs from intermediate DB - query = xref_object_identity_query.where( - XrefUORM.source_id == source_row.source_id, - XrefUORM.info_type == source_row.info_type, - ) - for xref_row in xref_dbi.execute(query).mappings().all(): - xref_id = int(xref_row.xref_id) - object_xref_id = int(xref_row.object_xref_id) - - if last_xref_id != xref_id: - xref_list.append(xref_id) - count += 1 - - # Add xref into core DB - info_text = xref_row.info_text - if not info_text: - info_text = where_from - xref_args = { - "xref_id": xref_id, - "accession": xref_row.accession, - "external_db_id": external_id, - "label": xref_row.label, - "description": xref_row.description, - "version": xref_row.version, - "info_type": xref_row.info_type, - "info_text": info_text, - } - xref_id = self.add_xref(xref_offset, xref_args, core_dbi) - last_xref_id = xref_id - - # Add object xref into core DB - object_xref_args = { - "object_xref_id": object_xref_id, - "ensembl_id": xref_row.ensembl_id, - "ensembl_type": xref_row.ensembl_object_type, - "xref_id": xref_id + xref_offset, - "analysis_id": analysis_ids[xref_row.ensembl_object_type], - } - object_xref_id = self.add_object_xref( - object_xref_offset, object_xref_args, core_dbi - ) - - # Add identity xref into core DB - query = ( - insert(IdentityXrefCORM) - .values( - object_xref_id=object_xref_id + object_xref_offset, - xref_identity=xref_row.query_identity, - ensembl_identity=xref_row.target_identity, - xref_start=xref_row.hit_start, - xref_end=xref_row.hit_end, - ensembl_start=xref_row.translation_start, - ensembl_end=xref_row.translation_end, - cigar_line=xref_row.cigar_line, - score=xref_row.score, - evalue=xref_row.evalue, + # Get synonyms + query = select(SynonymORM.xref_id, SynonymORM.synonym).where( + SynonymORM.xref_id.in_(xref_list) ) - .prefix_with("IGNORE") - ) - core_dbi.execute(query) - - logging.info( - f"\tLoaded {count} SEQUENCE_MATCH xrefs for '{species_name}'" - ) - else: - logging.debug(f"\tPROBLEM: what type is {source_row.info_type}") + for syn_row in xref_dbi.execute(query).mappings().all(): + session.execute( + insert(ExternalSynonymORM).values( + xref_id=syn_row.xref_id + xref_offset, + synonym=syn_row.synonym, + ) + ) + syn_count += 1 - # Transfer synonym data - if len(xref_list) > 0: - syn_count = 0 + logging.info(f"\tLoaded {syn_count} synonyms for '{species_name}'") - # Get synonyms - query = select(SynonymORM.xref_id, SynonymORM.synonym).where( - SynonymORM.xref_id.in_(xref_list) - ) - for syn_row in xref_dbi.execute(query).mappings().all(): - core_dbi.execute( - insert(ExternalSynonymORM).values( - xref_id=syn_row.xref_id + xref_offset, - synonym=syn_row.synonym, + # Set dumped status + xref_dbi.execute( + update(XrefUORM) + .values(dumped="MAPPED") + .where(XrefUORM.xref_id.in_(xref_list)) ) - ) - - syn_count += 1 - logging.info(f"\tLoaded {syn_count} synonyms for '{species_name}'") - - # Set dumped status - xref_dbi.execute( - update(XrefUORM) - .values(dumped="MAPPED") - .where(XrefUORM.xref_id.in_(xref_list)) - ) + # Update release info + if source_row.source_release and source_row.source_release != "1": + session.execute( + update(ExternalDbORM) + .values(db_release=source_row.source_release) + .where(ExternalDbORM.external_db_id == 
external_id) + ) - # Update release info - if source_row.source_release and source_row.source_release != "1": - core_dbi.execute( - update(ExternalDbORM) - .values(db_release=source_row.source_release) - .where(ExternalDbORM.external_db_id == external_id) - ) + session.commit() + except SQLAlchemyError as e: + session.rollback() + logging.error(f"Failed to load xrefs for source '{source_row.name}': {e}") + raise RuntimeError(f"Transaction failed for source '{source_row.name}'") # Update the unmapped xrefs self.update_unmapped_xrefs(xref_dbi) @@ -511,7 +498,7 @@ def delete_projection_data(self, dbi: Connection) -> None: f"Deleted all PROJECTIONs rows: {counts['external_synonym']} external_synonyms, {counts['dependent_xref']} dependent_xrefs, {counts['object_xref']} object_xrefs, {counts['xref']} xrefs" ) - def deleted_existing_xrefs(self, name_to_external_db_id: Dict[str, int], xref_dbi: Connection, core_dbi: Connection) -> None: + def deleted_existing_xrefs(self, name_to_external_db_id: Dict[str, int], xref_dbi: Connection) -> None: # For each external_db to be updated, delete the existing xrefs query = ( select(SourceUORM.name, func.count(XrefUORM.xref_id).label("count")) @@ -522,92 +509,111 @@ def deleted_existing_xrefs(self, name_to_external_db_id: Dict[str, int], xref_db .group_by(SourceUORM.name) ) for row in xref_dbi.execute(query).mappings().all(): - if not name_to_external_db_id.get(row.name): + name = row.name + external_db_id = name_to_external_db_id.get(name) + if not external_db_id: continue - name = row.name - external_db_id = name_to_external_db_id[name] - counts = {"master_dependent_xref": 0, "master_object_xref": 0} + counts = { + "gene": 0, + "external_synonym": 0, + "identity_xref": 0, + "object_xref": 0, + "master_dependent_xref": 0, + "master_object_xref": 0, + "dependent_xref": 0, + "xref": 0, + "unmapped_object": 0, + } logging.info(f"For source '{name}'") - counts["gene"] = core_dbi.execute( - update(GeneORM) - .values(display_xref_id=None, description=None) - .where( - GeneORM.display_xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - logging.info( - f"\tSet display_xref_id=NULL and description=NULL for {counts['gene']} gene row(s)" - ) - - counts["external_synonym"] = core_dbi.execute( - delete(ExternalSynonymORM).where( - ExternalSynonymORM.xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - counts["identity_xref"] = core_dbi.execute( - delete(IdentityXrefCORM).where( - IdentityXrefCORM.object_xref_id == ObjectXrefCORM.object_xref_id, - ObjectXrefCORM.xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - counts["object_xref"] = core_dbi.execute( - delete(ObjectXrefCORM).where( - ObjectXrefCORM.xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - - MasterXref = aliased(XrefCORM) - DependentXref = aliased(XrefCORM) - - query = select( - ObjectXrefCORM.object_xref_id, - DependentXrefCORM.master_xref_id, - DependentXrefCORM.dependent_xref_id, - ).where( - ObjectXrefCORM.object_xref_id == DependentXrefCORM.object_xref_id, - MasterXref.xref_id == DependentXrefCORM.master_xref_id, - DependentXref.xref_id == DependentXrefCORM.dependent_xref_id, - MasterXref.external_db_id == external_db_id, - ) - for row in core_dbi.execute(query).mappings().all(): - counts["master_dependent_xref"] += core_dbi.execute( - delete(DependentXrefCORM).where( - DependentXrefCORM.master_xref_id == row.master_xref_id, - 
DependentXrefCORM.dependent_xref_id == row.dependent_xref_id, + Session = sessionmaker(bind=self.core().execution_options(isolation_level="READ COMMITTED")) + with Session.begin() as session: + try: + counts["gene"] = session.execute( + update(GeneORM) + .values(display_xref_id=None, description=None) + .where( + GeneORM.display_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + logging.info( + f"\tSet display_xref_id=NULL and description=NULL for {counts['gene']} gene row(s)" ) - ).rowcount - counts["master_object_xref"] += core_dbi.execute( - delete(ObjectXrefCORM).where( - ObjectXrefCORM.object_xref_id == row.object_xref_id + + counts["external_synonym"] = session.execute( + delete(ExternalSynonymORM).where( + ExternalSynonymORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["identity_xref"] = session.execute( + delete(IdentityXrefCORM).where( + IdentityXrefCORM.object_xref_id == ObjectXrefCORM.object_xref_id, + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["object_xref"] = session.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + + MasterXref = aliased(XrefCORM) + DependentXref = aliased(XrefCORM) + + query = select( + ObjectXrefCORM.object_xref_id, + DependentXrefCORM.master_xref_id, + DependentXrefCORM.dependent_xref_id, + ).where( + ObjectXrefCORM.object_xref_id == DependentXrefCORM.object_xref_id, + MasterXref.xref_id == DependentXrefCORM.master_xref_id, + DependentXref.xref_id == DependentXrefCORM.dependent_xref_id, + MasterXref.external_db_id == external_db_id, ) - ).rowcount + for sub_row in session.execute(query).mappings().all(): + counts["master_dependent_xref"] += session.execute( + delete(DependentXrefCORM).where( + DependentXrefCORM.master_xref_id == sub_row.master_xref_id, + DependentXrefCORM.dependent_xref_id == sub_row.dependent_xref_id, + ) + ).rowcount + counts["master_object_xref"] += session.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.object_xref_id == sub_row.object_xref_id + ) + ).rowcount - counts["dependent_xref"] = core_dbi.execute( - delete(DependentXrefCORM).where( - DependentXrefCORM.dependent_xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - counts["xref"] = core_dbi.execute( - delete(XrefCORM).where(XrefCORM.external_db_id == external_db_id) - ).rowcount - counts["unmapped_object"] = core_dbi.execute( - delete(UnmappedObjectORM).where( - UnmappedObjectORM.unmapped_object_type == "xref", - UnmappedObjectORM.external_db_id == external_db_id, - ) - ).rowcount + counts["dependent_xref"] = session.execute( + delete(DependentXrefCORM).where( + DependentXrefCORM.dependent_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["xref"] = session.execute( + delete(XrefCORM).where(XrefCORM.external_db_id == external_db_id) + ).rowcount + counts["unmapped_object"] = session.execute( + delete(UnmappedObjectORM).where( + UnmappedObjectORM.unmapped_object_type == "xref", + UnmappedObjectORM.external_db_id == external_db_id, + ) + ).rowcount - logging.info( - f"\tDeleted rows: {counts['external_synonym']} external_synonyms, {counts['identity_xref']} identity_xrefs, {counts['object_xref']} object_xrefs, {counts['master_dependent_xref']} master dependent_xrefs, {counts['master_object_xref']} master object_xrefs, 
{counts['dependent_xref']} dependent_xrefs, {counts['xref']} xrefs, {counts['unmapped_object']} unmapped_objects" - ) + logging.info( + f"\tDeleted rows: {counts['external_synonym']} external_synonyms, {counts['identity_xref']} identity_xrefs, {counts['object_xref']} object_xrefs, {counts['master_dependent_xref']} master dependent_xrefs, {counts['master_object_xref']} master object_xrefs, {counts['dependent_xref']} dependent_xrefs, {counts['xref']} xrefs, {counts['unmapped_object']} unmapped_objects" + ) + + session.commit() + except SQLAlchemyError as e: + session.rollback() + logging.error(f"Failed to delete rows for source '{name}': {e}") + raise RuntimeError(f"Transaction failed for source '{name}'") def get_analysis(self, dbi: Connection) -> Dict[str, int]: analysis_ids = {} @@ -617,19 +623,21 @@ def get_analysis(self, dbi: Connection) -> Dict[str, int]: "Translation": "xrefexonerateprotein", } - for object_type in ["Gene", "Transcript", "Translation"]: - logic_name = type_to_logic_name[object_type] + for object_type, logic_name in type_to_logic_name.items(): analysis_ids[object_type] = self.get_single_analysis(logic_name, dbi) + # Add checksum analysis ID analysis_ids["checksum"] = self.get_single_analysis("xrefchecksum", dbi) return analysis_ids def get_single_analysis(self, logic_name: str, dbi: Connection) -> int: + # Retrieve the analysis ID for the given logic name analysis_id = dbi.execute( select(AnalysisORM.analysis_id).where(AnalysisORM.logic_name == logic_name) ).scalar() + # If the analysis ID does not exist, create a new analysis entry if not analysis_id: Session = sessionmaker(self.core()) with Session.begin() as session: @@ -641,7 +649,7 @@ def get_single_analysis(self, logic_name: str, dbi: Connection) -> int: return analysis_id - def add_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> int: + def add_xref(self, offset: int, args: Dict[str, Any], session: Session) -> int: xref_id = args["xref_id"] accession = args["accession"] external_db_id = args["external_db_id"] @@ -651,7 +659,8 @@ def add_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> int: info_type = args["info_type"] info_text = args["info_text"] - new_xref_id = dbi.execute( + # Check if the xref already exists + new_xref_id = session.execute( select(XrefCORM.xref_id).where( XrefCORM.dbprimary_acc == accession, XrefCORM.external_db_id == external_db_id, @@ -661,8 +670,9 @@ def add_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> int: ) ).scalar() + # If it doesn't exist, insert it if not new_xref_id: - dbi.execute( + session.execute( insert(XrefCORM).values( xref_id=xref_id + offset, external_db_id=external_db_id, @@ -674,19 +684,19 @@ def add_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> int: info_text=info_text, ) ) - return xref_id else: return int(new_xref_id) - offset - def add_object_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> int: + def add_object_xref(self, offset: int, args: Dict[str, Any], session: Session) -> int: object_xref_id = args["object_xref_id"] ensembl_id = args["ensembl_id"] ensembl_type = args["ensembl_type"] xref_id = args["xref_id"] analysis_id = args["analysis_id"] - new_object_xref_id = dbi.execute( + # Check if the object_xref already exists + new_object_xref_id = session.execute( select(ObjectXrefCORM.object_xref_id).where( ObjectXrefCORM.xref_id == xref_id, ObjectXrefCORM.ensembl_object_type == ensembl_type, @@ -695,8 +705,9 @@ def add_object_xref(self, offset: int, args: Dict[str, Any], 
dbi: Connection) -> ) ).scalar() + # If it doesn't exist, insert it if not new_object_xref_id: - dbi.execute( + session.execute( insert(ObjectXrefCORM).values( object_xref_id=object_xref_id + offset, ensembl_id=ensembl_id, @@ -705,7 +716,6 @@ def add_object_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> analysis_id=analysis_id, ) ) - return object_xref_id else: return int(new_object_xref_id) - offset @@ -724,8 +734,7 @@ def update_unmapped_xrefs(self, dbi: Connection) -> None: XrefUORM.info_type == "DIRECT", ) ) - result = dbi.execute(query).fetchall() - xref_ids = [row[0] for row in result] + xref_ids = [row.xref_id for row in dbi.execute(query).mappings().all()] dbi.execute( update(XrefUORM) .values(dumped="UNMAPPED_NO_STABLE_ID") @@ -761,8 +770,7 @@ def update_unmapped_xrefs(self, dbi: Connection) -> None: DependentXref.info_type == "DEPENDENT", ) ) - result = dbi.execute(query).fetchall() - xref_ids = [row[0] for row in result] + xref_ids = [row.xref_id for row in dbi.execute(query).mappings().all()] dbi.execute( update(XrefUORM) .values(dumped="UNMAPPED_MASTER_FAILED") @@ -784,15 +792,14 @@ def update_unmapped_xrefs(self, dbi: Connection) -> None: XrefUORM.info_type == "SEQUENCE_MATCH", ) ) - result = dbi.execute(query).fetchall() - xref_ids = [row[0] for row in result] + xref_ids = [row.xref_id for row in dbi.execute(query).mappings().all()] dbi.execute( update(XrefUORM) .values(dumped="UNMAPPED_NO_MAPPING") .where(XrefUORM.xref_id.in_(xref_ids)) ) - # Dependents with non existent masters (none on time of loading) + # Dependents with non-existent masters (none at the time of loading) dbi.execute( update(XrefUORM) .values(dumped="UNMAPPED_NO_MASTER") diff --git a/src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py b/src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py index b97b858c7..dbf930854 100644 --- a/src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py +++ b/src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py @@ -15,7 +15,6 @@ """Base method module for handling checksums.""" from Bio import SeqIO -from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq import hashlib @@ -26,7 +25,6 @@ DEFAULT_BATCH_SIZE = 1000 DEFAULT_LOG_SIZE = 10000 - class ChecksumBasic: def __init__(self, args: Dict[str, Any] = None) -> None: if args is None: @@ -51,36 +49,29 @@ def batch_size(self, batch_size: int = None) -> int: return self._batch_size def run(self, target: str, source_id: int, object_type: str, dbi: Connection) -> List[Dict[str, Any]]: - results, tmp_list = [], [] - count, total_count = 0, 0 + results = [] + tmp_list = [] + count = 0 + total_count = 0 batch_size = self.batch_size() for record in SeqIO.parse(target, "fasta"): tmp_list.append(record) count += 1 - if (count % batch_size) == 0: - res = self.perform_mapping(tmp_list, source_id, object_type, dbi) - for row in res: - results.append(row) - + if count % batch_size == 0: + results.extend(self.perform_mapping(tmp_list, source_id, object_type, dbi)) total_count += count - if total_count % DEFAULT_LOG_SIZE: - self.mapper().log_progress( - f"Finished batch mapping of {total_count} sequences" - ) + if total_count % DEFAULT_LOG_SIZE == 0: + self.mapper().log_progress(f"Finished batch mapping of {total_count} sequences") count = 0 tmp_list.clear() # Final mapping if there were some left over - if len(tmp_list) > 0: - self.mapper().log_progress( - f"Finished batch mapping of {total_count} sequences" - ) - res = self.perform_mapping(tmp_list, 
source_id, object_type, dbi) - for row in res: - results.append(row) - tmp_list.clear() + if tmp_list: + results.extend(self.perform_mapping(tmp_list, source_id, object_type, dbi)) + total_count += count + self.mapper().log_progress(f"Finished batch mapping of {total_count} sequences") return results diff --git a/src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py b/src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py index 993753cd6..ed02c65ba 100644 --- a/src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py +++ b/src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py @@ -14,11 +14,14 @@ """Base method module for handling mysql checksums.""" -from ensembl.production.xrefs.mappers.methods.ChecksumBasic import * - from sqlalchemy import select +from typing import Any, Dict, List +from Bio.SeqRecord import SeqRecord +from sqlalchemy.engine import Connection + from ensembl.xrefs.xref_source_db_model import ChecksumXref as ChecksumXrefSORM +from ensembl.production.xrefs.mappers.methods.ChecksumBasic import ChecksumBasic class MySQLChecksum(ChecksumBasic): def perform_mapping(self, sequences: List[SeqRecord], source_id: int, object_type: str, dbi: Connection) -> List[Dict[str, Any]]: @@ -32,13 +35,15 @@ def perform_mapping(self, sequences: List[SeqRecord], source_id: int, object_typ ChecksumXrefSORM.checksum == checksum, ChecksumXrefSORM.source_id == source_id, ) - for row in dbi.execute(query).mappings().all(): - local_upi = row.accession - if upi: - raise LookupError( - f"The sequence {sequence.id} had a checksum of {checksum} but this resulted in more than one UPI: [{upi}, {local_upi}]" - ) - upi = local_upi + results = dbi.execute(query).mappings().all() + + if len(results) > 1: + upis = [row.accession for row in results] + raise LookupError( + f"The sequence {sequence.id} had a checksum of {checksum} but this resulted in more than one UPI: {upis}" + ) + elif results: + upi = results[0].accession if upi: final_results.append( diff --git a/src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py b/src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py index 3a2b20dbd..13ca0972b 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py +++ b/src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py @@ -14,8 +14,10 @@ """Mapper extension module for species aedes_aegypti.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class aedes_aegypti(BasicMapper): def gene_description_sources(self) -> List[str]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py b/src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py index 46e30cf99..0191d0b61 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py +++ b/src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py @@ -14,8 +14,10 @@ """Mapper extension module for species anopheles_gambiae.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class anopheles_gambiae(BasicMapper): def gene_description_sources(self) -> List[str]: diff --git 
a/src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py b/src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py index 36a5f6696..77725da25 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py +++ b/src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py @@ -14,8 +14,10 @@ """Mapper extension module for species culex_quinquefasciatus.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class culex_quinquefasciatus(BasicMapper): def gene_description_sources(self) -> List[str]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py b/src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py index 3a2b155ec..af81a04e9 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py +++ b/src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py @@ -14,10 +14,9 @@ """Mapper extension module for species danio_rerio.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs - class danio_rerio(BasicMapper): def set_display_xrefs(self) -> None: display = DisplayXrefs(self) diff --git a/src/python/ensembl/production/xrefs/mappers/species/drosophila.py b/src/python/ensembl/production/xrefs/mappers/species/drosophila.py index 2e327a735..3d1b5fb83 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/drosophila.py +++ b/src/python/ensembl/production/xrefs/mappers/species/drosophila.py @@ -14,8 +14,10 @@ """Mapper extension module for species drosophila.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class drosophila(BasicMapper): def gene_description_filter_regexps(self) -> List[str]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/eukaryota.py b/src/python/ensembl/production/xrefs/mappers/species/eukaryota.py index 1791da9c5..00be97b20 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/eukaryota.py +++ b/src/python/ensembl/production/xrefs/mappers/species/eukaryota.py @@ -14,8 +14,29 @@ """Mapper extension module for species eukaryota.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +from typing import Dict, List, Tuple +from sqlalchemy.orm import aliased +from sqlalchemy import select, update, func, delete +from sqlalchemy.sql.expression import Select +from sqlalchemy.dialects.mysql import insert + +from ensembl.xrefs.xref_update_db_model import ( + Source as SourceUORM, + Xref as XrefUORM, + DependentXref as DependentXrefUORM, + ObjectXref as ObjectXrefUORM +) + +from ensembl.core.models import ( + Gene as GeneORM, + Transcript as TranscriptORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + ObjectXref as ObjectXrefCORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class eukaryota(BasicMapper): def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py b/src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py index 616bd7326..b19bbf9be 100644 --- 
a/src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py +++ b/src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py @@ -14,8 +14,8 @@ """Mapper extension module for species homo_sapiens.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper +from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs class homo_sapiens(BasicMapper): def official_name(self) -> str: diff --git a/src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py b/src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py index 5861e03a7..10cc0d739 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py +++ b/src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py @@ -14,8 +14,10 @@ """Mapper extension module for species ixodes_scapularis.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class ixodes_scapularis(BasicMapper): def gene_description_sources(self) -> List[str]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py b/src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py index cde22b34f..307b0c7b0 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py +++ b/src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py @@ -14,8 +14,8 @@ """Mapper extension module for species mus_musculus.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper +from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs class mus_musculus(BasicMapper): def official_name(self) -> str: diff --git a/src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py b/src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py index df2bb072c..93725c0b6 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py +++ b/src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py @@ -14,8 +14,10 @@ """Mapper extension module for species neurospora_crassa.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class neurospora_crassa(BasicMapper): def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/parasite.py b/src/python/ensembl/production/xrefs/mappers/species/parasite.py index 408d84d08..81fd43753 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/parasite.py +++ b/src/python/ensembl/production/xrefs/mappers/species/parasite.py @@ -14,8 +14,9 @@ """Mapper extension module for species parasite.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import List +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class parasite(BasicMapper): def set_transcript_names(self) -> None: diff --git a/src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py b/src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py index 53925875d..4ab5a86f3 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py +++ 
b/src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py @@ -14,8 +14,8 @@ """Mapper extension module for species rattus_norvegicus.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper +from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs class rattus_norvegicus(BasicMapper): def official_name(self) -> str: diff --git a/src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py b/src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py index 707dcc7db..088cf7f42 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py +++ b/src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py @@ -14,8 +14,10 @@ """Mapper extension module for species saccharomyces_cerevisiae.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class saccharomyces_cerevisiae(BasicMapper): def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py b/src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py index 742f1207c..c8547e1af 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py +++ b/src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py @@ -14,8 +14,18 @@ """Mapper extension module for species sars_cov_2.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +from sqlalchemy import delete, insert, select, update, func + +from ensembl.core.models import ( + Gene as GeneORM, + Transcript as TranscriptORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + ObjectXref as ObjectXrefCORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class sars_cov_2(BasicMapper): def set_transcript_names(self) -> None: diff --git a/src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py b/src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py index 8c7d66d8e..c0f9adfbd 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py +++ b/src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py @@ -14,8 +14,10 @@ """Mapper extension module for species schizosaccharomyces_pombe.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class schizosaccharomyces_pombe(BasicMapper): def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py b/src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py index a3182e7f7..fb733b7a6 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py +++ b/src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py @@ -14,8 +14,8 @@ """Mapper extension module for species sus_scrofa.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper +from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs class sus_scrofa(BasicMapper): def official_name(self) -> 
str: diff --git a/src/python/ensembl/production/xrefs/mappers/species/wormbase.py b/src/python/ensembl/production/xrefs/mappers/species/wormbase.py index 796d6260e..c666bc3bc 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/wormbase.py +++ b/src/python/ensembl/production/xrefs/mappers/species/wormbase.py @@ -14,8 +14,19 @@ """Mapper extension module for species wormbase.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +import logging +from typing import List +from sqlalchemy.sql.expression import select, update +from ensembl.core.models import ( + Gene as GeneORM, + Transcript as TranscriptORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + ObjectXref as ObjectXrefCORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class wormbase(BasicMapper): def set_display_xrefs(self) -> None: @@ -41,7 +52,6 @@ def set_display_xrefs(self) -> None: "Could not find wormbase_transcript and wormbase_locus in external_db table, so doing nothing" ) - xref_dbi.close() core_dbi.close() return diff --git a/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py b/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py index 53e78e887..f7e166a5d 100644 --- a/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py +++ b/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py @@ -30,7 +30,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: species_id = args.get("species_id") species_name = args.get("species_name") xref_file = args.get("file", "") - dba = args.get("dba") + db_url = args.get("extra_db_url") ensembl_release = args.get("ensembl_release") xref_dbi = args.get("xref_dbi") verbose = args.get("verbose", False) @@ -62,7 +62,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: # Connect to the appropriate arrayexpress db arrayexpress_db_url = self.get_arrayexpress_db_url( - project, db_user, db_pass, db_host, db_port, db_name, species_name, ensembl_release, dba, verbose + project, db_user, db_pass, db_host, db_port, db_name, species_name, ensembl_release, db_url, verbose ) if not arrayexpress_db_url: @@ -118,7 +118,7 @@ def is_arryaexpress_active(self, species_lookup: Dict[str, bool], names: List[st return True return False - def get_arrayexpress_db_url(self, project: str, db_user: str, db_pass: str, db_host: str, db_port: str, db_name: str, species_name: str, ensembl_release: str, dba: str, verbose: bool) -> Optional[URL]: + def get_arrayexpress_db_url(self, project: str, db_user: str, db_pass: str, db_host: str, db_port: str, db_name: str, species_name: str, ensembl_release: str, db_url: str, verbose: bool) -> Optional[URL]: if db_host: return URL.create("mysql", db_user, db_pass, db_host, db_port, db_name) elif project == "ensembl": @@ -130,13 +130,13 @@ def get_arrayexpress_db_url(self, project: str, db_user: str, db_pass: str, db_h if verbose: logging.info("Looking for db in mysql-eg-staging-1 and mysql-eg-staging-2") registry = "ensro@mysql-eg-staging-1.ebi.ac.uk:4160" - db_url = self.get_db_from_registry(species_name, "core", ensembl_release, registry) - if not db_url: + sta_db_url = self.get_db_from_registry(species_name, "core", ensembl_release, registry) + if not sta_db_url: registry = "ensro@mysql-eg-staging-2.ebi.ac.uk:4275" return self.get_db_from_registry(species_name, "core", ensembl_release, registry) + return sta_db_url + elif db_url: return db_url - elif dba: - return dba return None diff --git a/src/python/ensembl/production/xrefs/parsers/BaseParser.py 
b/src/python/ensembl/production/xrefs/parsers/BaseParser.py index ad6440e37..baae025ed 100644 --- a/src/python/ensembl/production/xrefs/parsers/BaseParser.py +++ b/src/python/ensembl/production/xrefs/parsers/BaseParser.py @@ -142,7 +142,7 @@ def set_release(self, source_id: int, s_release: str, dbi: Connection) -> None: .values(source_release=s_release) ) - def upload_xref_object_graphs(self, xrefs: List[Dict[str, Any]], dbi: Connection) -> None: + def add_xref_objects(self, xrefs: List[Dict[str, Any]], dbi: Connection) -> None: """Adds xref data into a database. Uploads main xref data, related direct xrefs, dependent xrefs, and synonyms. @@ -356,45 +356,6 @@ def add_direct_xref(self, general_xref_id: int, ensembl_stable_id: str, ensembl_ ) ) - def add_to_direct_xrefs(self, args: Dict[str, Any], dbi: Connection) -> None: - """Adds direct xref data into both the xref table and direct xref tables in a database. - This calls the functions add_xref and add_direct_xref. - - Parameters - ---------- - args: dict - The direct xref arguments. These include: - - stable_id: The ensEMBL feature stable ID - - ensembl_type: The feature type (gene, transcript, or translation) - - accession: The xref accession - - source_id: The xref source ID - - species_id: The species ID - - version (optional): The xref version (default is 0) - - label (optional): The xref label (default is the xref accession) - - description (optional): The xref description - - linkage (optional): The type of link between the xref and ensEMBL - - info_text (optional): Additional info related to the xref (default is empty string) - - info_type (optional): The type of xref being added (default is DIRECT) - dbi: sqlalchemy.engine.Connection - The database connection to update in - """ - stable_id = args["stable_id"] - ensembl_type = args["ensembl_type"] - accession = args["accession"] - source_id = args["source_id"] - species_id = args["species_id"] - version = args.get("version", 0) - label = args.get("label", accession) - description = args.get("description") - linkage = args.get("linkage") - info_text = args.get("info_text", "") - - args["info_type"] = args.get("info_type", "DIRECT") - - # If the accession already has an xref find it else cretae a new one - direct_xref_id = self.add_xref(args, dbi) - self.add_direct_xref(direct_xref_id, stable_id, ensembl_type, linkage, dbi) - def get_direct_xref_id(self, stable_id: str, ensembl_type: str, link: str, dbi: Connection) -> int: """Retrieves the direct xref row ID from stable ID, ensEMBL type and linkage type. @@ -710,7 +671,7 @@ def build_dependent_mappings(self, source_id: int, dbi: Connection) -> None: f"{row.master_xref_id}|{row.dependent_xref_id}" ] = row.linkage_annotation - def get_valid_codes(self, source_name: str, species_id: int, dbi: Connection) -> Dict[str, List[int]]: + def get_acc_to_xref_ids(self, source_name: str, species_id: int, dbi: Connection) -> Dict[str, List[int]]: """Retrieves the xref accessions and IDs related to a specific xref source and species from a database. Parameters @@ -726,7 +687,7 @@ def get_valid_codes(self, source_name: str, species_id: int, dbi: Connection) -> ------- A dict variable containing {'accession' : [list of xref IDs]} items. 
""" - valid_codes = {} + acc_to_xref_ids = {} sources = [] big_name = "%" + source_name.upper() + "%" @@ -741,9 +702,9 @@ def get_valid_codes(self, source_name: str, species_id: int, dbi: Connection) -> XrefUORM.species_id == species_id, XrefUORM.source_id == source_id ) for row in dbi.execute(query).fetchall(): - valid_codes.setdefault(row[0], []).append(row[1]) + acc_to_xref_ids.setdefault(row[0], []).append(row[1]) - return valid_codes + return acc_to_xref_ids def is_file_header_valid(self, columns_count: int, field_patterns: List[str], header: List[str], case_sensitive: bool = False) -> bool: """Checks whether the provided file header is valid by checking length and column patterns. @@ -780,7 +741,7 @@ def is_file_header_valid(self, columns_count: int, field_patterns: List[str], he return True def add_to_syn(self, accession: str, source_id: int, synonym: str, species_id: int, dbi: Connection) -> None: - """Add synomyn data for an xref given its accession and source ID. + """Adds synomyn data for an xref given its accession and source ID. Parameters ---------- diff --git a/src/python/ensembl/production/xrefs/parsers/CCDSParser.py b/src/python/ensembl/production/xrefs/parsers/CCDSParser.py index 24d1e088c..159638916 100644 --- a/src/python/ensembl/production/xrefs/parsers/CCDSParser.py +++ b/src/python/ensembl/production/xrefs/parsers/CCDSParser.py @@ -33,7 +33,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: source_id = args.get("source_id") species_id = args.get("species_id") xref_file = args.get("file", "") - dba = args.get("dba") + db_url = args.get("extra_db_url") xref_dbi = args.get("xref_dbi") verbose = args.get("verbose", False) @@ -53,8 +53,8 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: ccds_db_url = URL.create( "mysql", db_user, db_pass, db_host, db_port, db_name ) - elif dba: - ccds_db_url = dba + elif db_url: + ccds_db_url = db_url if not ccds_db_url: return 1, "Could not find CCDS DB." 
diff --git a/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py b/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py index 699c633ba..dc0d5720f 100644 --- a/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py +++ b/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py @@ -35,6 +35,10 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: if not source_id or not species_id or not xref_file: raise AttributeError("Missing required arguments: source_id, species_id, and file") + + wiki_source_id = self.get_source_id_for_source_name("WikiGene", xref_dbi) + if verbose: + logging.info(f"Wiki source id = {wiki_source_id}") with self.get_filehandle(xref_file) as file_io: if file_io.read(1) == '': @@ -64,10 +68,6 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: if not self.is_file_header_valid(self.EXPECTED_NUMBER_OF_COLUMNS, patterns, header): raise ValueError(f"Malformed or unexpected header in EntrezGene file {xref_file}") - wiki_source_id = self.get_source_id_for_source_name("WikiGene", xref_dbi) - if verbose: - logging.info(f"Wiki source id = {wiki_source_id}") - processed_count, syn_count = self.process_lines(csv_reader, source_id, species_id, wiki_source_id, xref_dbi) result_message = f"{processed_count} EntrezGene Xrefs and {processed_count} WikiGene Xrefs added with {syn_count} synonyms" diff --git a/src/python/ensembl/production/xrefs/parsers/HGNCParser.py b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py index b8bca4e45..d64eead15 100644 --- a/src/python/ensembl/production/xrefs/parsers/HGNCParser.py +++ b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py @@ -24,7 +24,7 @@ from sqlalchemy import select from sqlalchemy.engine import Connection from sqlalchemy.engine.url import URL -from unidecode import unidecode +from unidecode import unidecode # type: ignore from ensembl.core.models import ( Transcript as TranscriptORM, @@ -39,7 +39,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: source_id = args.get("source_id") species_id = args.get("species_id") xref_file = args.get("file") - dba = args.get("dba") + db_url = args.get("extra_db_url") xref_dbi = args.get("xref_dbi") verbose = args.get("verbose", False) @@ -66,7 +66,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: name_count = {key: 0 for key in source_ids} # Connect to the ccds db - ccds_db_url = dba or self.construct_db_url(file_params) + ccds_db_url = db_url or self.construct_db_url(file_params) if not ccds_db_url: raise AttributeError("No ensembl ccds database provided") if verbose: @@ -95,11 +95,13 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: result_message += f"{syn_count} synonyms added\n" result_message += f"{name_count['desc_only']} HGNC ids could not be associated in xrefs" + result_message = re.sub(r"\n", "--", result_message) + return 0, result_message def process_lines(self, csv_reader: csv.DictReader, source_ids: Dict[str, int], name_count: Dict[str, int], species_id: int, ccds_db_url: str, xref_dbi: Connection) -> int: # Prepare lookup lists - refseq = self.get_valid_codes("refseq", species_id, xref_dbi) + refseq = self.get_acc_to_xref_ids("refseq", species_id, xref_dbi) source_list = ["refseq_peptide", "refseq_mRNA"] entrezgene = self.get_valid_xrefs_for_dependencies("EntrezGene", source_list, xref_dbi) diff --git a/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py b/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py index 94dc7466a..c156c6758 100644 --- 
a/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py +++ b/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py @@ -60,7 +60,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: } xrefs.append(xref) - self.upload_xref_object_graphs(xrefs, xref_dbi) + self.add_xref_objects(xrefs, xref_dbi) result_message = f"{len(xrefs)} JGI_ xrefs successfully parsed" diff --git a/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py b/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py index 4a1654fc5..cd63875b4 100644 --- a/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py +++ b/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py @@ -71,9 +71,9 @@ def process_lines(self, csv_reader: csv.reader, xref_file:str, species_id: int, self.build_dependent_mappings(mim_gene_source_id, xref_dbi) self.build_dependent_mappings(mim_morbid_source_id, xref_dbi) - mim_gene = self.get_valid_codes("MIM_GENE", species_id, xref_dbi) - mim_morbid = self.get_valid_codes("MIM_MORBID", species_id, xref_dbi) - entrez = self.get_valid_codes("EntrezGene", species_id, xref_dbi) + mim_gene = self.get_acc_to_xref_ids("MIM_GENE", species_id, xref_dbi) + mim_morbid = self.get_acc_to_xref_ids("MIM_MORBID", species_id, xref_dbi) + entrez = self.get_acc_to_xref_ids("EntrezGene", species_id, xref_dbi) # Read lines for line in csv_reader: diff --git a/src/python/ensembl/production/xrefs/parsers/RFAMParser.py b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py index e760cbf9e..4f13abd3e 100644 --- a/src/python/ensembl/production/xrefs/parsers/RFAMParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py @@ -17,7 +17,7 @@ import logging import os import re -import wget +import wget # type: ignore from typing import Any, Dict, List, Optional, Tuple from urllib.parse import urlparse from sqlalchemy import and_, select @@ -44,7 +44,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: species_id = args.get("species_id") species_name = args.get("species_name") xref_file = args.get("file") - dba = args.get("dba") + db_url = args.get("extra_db_url") ensembl_release = args.get("ensembl_release") xref_dbi = args.get("xref_dbi") verbose = args.get("verbose", False) @@ -70,7 +70,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: species_name = species_id_to_names[species_id][0] # Connect to the appropriate rfam db - rfam_db_url = self.get_rfam_db_url(db_host, db_user, db_pass, db_port, db_name, dba, species_name, ensembl_release, verbose) + rfam_db_url = self.get_rfam_db_url(db_host, db_user, db_pass, db_port, db_name, db_url, species_name, ensembl_release, verbose) if not rfam_db_url: raise AttributeError("Could not find RFAM DB.") if verbose: @@ -86,11 +86,11 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: result_message = f"Added {xref_count} RFAM xrefs and {direct_count} direct xrefs" return 0, result_message - def get_rfam_db_url(self, db_host: str, db_user: str, db_pass: str, db_port: str, db_name: str, dba: str, species_name: str, ensembl_release: str, verbose: bool) -> Any: + def get_rfam_db_url(self, db_host: str, db_user: str, db_pass: str, db_port: str, db_name: str, db_url: str, species_name: str, ensembl_release: str, verbose: bool) -> Any: if db_host: return URL.create("mysql", db_user, db_pass, db_host, db_port, db_name) - elif dba: - return dba + elif db_url: + return db_url else: if verbose: logging.info("Looking for db in mysql-ens-sta-1") diff --git a/src/python/ensembl/production/xrefs/parsers/RGDParser.py 
b/src/python/ensembl/production/xrefs/parsers/RGDParser.py index 54b574e82..22284fd4a 100644 --- a/src/python/ensembl/production/xrefs/parsers/RGDParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RGDParser.py @@ -55,7 +55,7 @@ def process_lines(self, csv_reader: csv.DictReader, source_id: int, direct_sourc dependent_count, ensembl_count, mismatch_count, syn_count = 0, 0, 0, 0 # Used to assign dbIDs for when RGD Xrefs are dependent on RefSeq xrefs - preloaded_refseq = self.get_valid_codes("refseq", species_id, xref_dbi) + preloaded_refseq = self.get_acc_to_xref_ids("refseq", species_id, xref_dbi) for line in csv_reader: # Don't bother doing anything if we don't have an RGD ID or if the symbol is an Ensembl ID diff --git a/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py b/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py index c00df2cc4..ba6ec9f25 100644 --- a/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py +++ b/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py @@ -100,7 +100,7 @@ def process_file(self, xref_file: str, alias_to_species_id: Dict[str, int], sour # Get existing uniprot accessions is_uniprot = bool(re.search("UniProt", xref_file)) - uniprot_accessions = self.get_valid_codes("uniprot/", species_id, xref_dbi) if is_uniprot else {} + uniprot_accessions = self.get_acc_to_xref_ids("uniprot/", species_id, xref_dbi) if is_uniprot else {} with self.get_filehandle(xref_file) as file_io: if file_io.read(1) == '': diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py index 61662dcaf..699a0af83 100644 --- a/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py @@ -27,7 +27,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: source_id = args.get("source_id") species_id = args.get("species_id") species_name = args.get("species_name") - dba = args.get("dba") + db_url = args.get("extra_db_url") xref_dbi = args.get("xref_dbi") verbose = args.get("verbose", False) @@ -46,7 +46,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: species_name = species_id_to_names[species_id][0] # Connect to the appropriate dbs - if dba: + if db_url: return self.run_perl_script(args, source_ids, species_name) else: # Not all species have an otherfeatures database, skip if not found @@ -85,14 +85,15 @@ def run_perl_script(self, args: Dict[str, Any], source_ids: Dict[str, int], spec logging.info(f"Running perl script {scripts_dir}/refseq_coordinate_parser.pl") perl_cmd = ( - f"perl {scripts_dir}/refseq_coordinate_parser.pl " - f"--xref_db_url '{xref_db_url}' " - f"--core_db_url '{args.get('core_db_url')}' " - f"--otherf_db_url '{args.get('dba')}' " - f"--source_ids '{source_ids_json}' " - f"--species_id {args.get('species_id')} " - f"--species_name {species_name} " - f"--release {args.get('ensembl_release')}" + f"perl " + f"{scripts_dir}/refseq_coordinate_parser.pl " + f"--xref_db_url '{xref_db_url}' " + f"--core_db_url '{args.get('core_db_url')}' " + f"--otherf_db_url '{args.get('extra_db_url')}' " + f"--source_ids '{source_ids_json}' " + f"--species_id {args.get('species_id')} " + f"--species_name {species_name} " + f"--release {args.get('ensembl_release')}" ) cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqParser.py
b/src/python/ensembl/production/xrefs/parsers/RefSeqParser.py index f9e62c218..9958c0490 100644 --- a/src/python/ensembl/production/xrefs/parsers/RefSeqParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RefSeqParser.py @@ -138,10 +138,10 @@ def create_xrefs(self, source_ids: Dict[str, int], species_id: int, species_name # Retrieve existing RefSeq mRNA, EntrezGene, and WikiGene xrefs entrez_acc_to_label = self.get_acc_to_label("EntrezGene", species_id, dbi) - refseq_ids = self.get_valid_codes("RefSeq_mRNA", species_id, dbi) - refseq_ids.update(self.get_valid_codes("RefSeq_mRNA_predicted", species_id, dbi)) - entrez_ids = self.get_valid_codes("EntrezGene", species_id, dbi) - wiki_ids = self.get_valid_codes("WikiGene", species_id, dbi) + refseq_ids = self.get_acc_to_xref_ids("RefSeq_mRNA", species_id, dbi) + refseq_ids.update(self.get_acc_to_xref_ids("RefSeq_mRNA_predicted", species_id, dbi)) + entrez_ids = self.get_acc_to_xref_ids("EntrezGene", species_id, dbi) + wiki_ids = self.get_acc_to_xref_ids("WikiGene", species_id, dbi) xrefs = [] @@ -217,7 +217,7 @@ def create_xrefs(self, source_ids: Dict[str, int], species_id: int, species_name xrefs.append(xref) if xrefs: - self.upload_xref_object_graphs(xrefs, dbi) + self.add_xref_objects(xrefs, dbi) result_message = ( f'Added {counts["num_mrna"]} mRNA xrefs, {counts["num_pred_mrna"]} predicted mRNA xrefs, ' diff --git a/src/python/ensembl/production/xrefs/parsers/UniProtParser.py b/src/python/ensembl/production/xrefs/parsers/UniProtParser.py index 1886c6fc6..42c2ceb01 100644 --- a/src/python/ensembl/production/xrefs/parsers/UniProtParser.py +++ b/src/python/ensembl/production/xrefs/parsers/UniProtParser.py @@ -344,12 +344,12 @@ def create_xrefs(self, source_ids: Dict[str, int], species_id: int, xref_file: s xrefs.append(xref) if count > 1000: - self.upload_xref_object_graphs(xrefs, dbi) + self.add_xref_objects(xrefs, dbi) count = 0 xrefs.clear() if xrefs: - self.upload_xref_object_graphs(xrefs, dbi) + self.add_xref_objects(xrefs, dbi) result_message = ( f'Read {counts["num_sp"]} SwissProt xrefs, {counts["num_sptr"]} SPTrEMBL xrefs with protein evidence codes 1-2, ' diff --git a/src/python/ensembl/production/xrefs/parsers/ZFINParser.py b/src/python/ensembl/production/xrefs/parsers/ZFINParser.py index 2792af8ff..66b51dadf 100644 --- a/src/python/ensembl/production/xrefs/parsers/ZFINParser.py +++ b/src/python/ensembl/production/xrefs/parsers/ZFINParser.py @@ -53,8 +53,8 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: descriptions[row.accession] = row.description # Get the Uniprot and RefSeq accessions - swiss = self.get_valid_codes("uniprot/swissprot", species_id, xref_dbi) - refseq = self.get_valid_codes("refseq", species_id, xref_dbi) + swiss = self.get_acc_to_xref_ids("uniprot/swissprot", species_id, xref_dbi) + refseq = self.get_acc_to_xref_ids("refseq", species_id, xref_dbi) file_dir = os.path.dirname(xref_file) counts = {"direct": 0, "uniprot": 0, "refseq": 0, "synonyms": 0, "mismatch": 0} @@ -142,7 +142,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: counts["mismatch"] += 1 # Get the added ZFINs - zfin = self.get_valid_codes("zfin", species_id, xref_dbi) + zfin = self.get_acc_to_xref_ids("zfin", species_id, xref_dbi) sources = [] query = select(SourceUORM.source_id).where(SourceUORM.name.like("ZFIN_ID")) diff --git a/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py b/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py index cc90ea85c..7dfc965d2 100644 --- 
a/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py +++ b/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py @@ -49,7 +49,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: if not xrefs: return 0, "No xrefs added" - self.upload_xref_object_graphs(xrefs, xref_dbi) + self.add_xref_objects(xrefs, xref_dbi) result_message = f"Read {len(xrefs)} xrefs from {file}" return 0, result_message From 0cdcc5862df4f644ca9ae63192554099b68b6602 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 6 Jan 2025 10:19:44 +0000 Subject: [PATCH 07/12] Test modifications --- src/python/test/xrefs/flatfiles/sources.json | 16 ---------------- .../test/xrefs/parsers/test_ccds_parser.py | 2 +- src/python/test/xrefs/test_checksum.py | 17 +++++++++++++++++ src/python/test/xrefs/test_download_source.py | 10 +++------- .../test/xrefs/test_schedule_alignment.py | 16 +++------------- src/python/test/xrefs/test_schedule_download.py | 2 +- src/python/test/xrefs/test_schedule_parse.py | 11 +++-------- 7 files changed, 28 insertions(+), 46 deletions(-) delete mode 100644 src/python/test/xrefs/flatfiles/sources.json diff --git a/src/python/test/xrefs/flatfiles/sources.json b/src/python/test/xrefs/flatfiles/sources.json deleted file mode 100644 index 1b45a2acb..000000000 --- a/src/python/test/xrefs/flatfiles/sources.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "name" : "ArrayExpress", - "parser" : "ArrayExpressParser", - "file" : "Database", - "db" : "core", - "priority" : 1 - }, - { - "name" : "RNACentral", - "parser" : "ChecksumParser", - "file" : "https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/md5/md5.tsv.gz", - "db" : "checksum", - "priority" : 1 - } -] \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_ccds_parser.py b/src/python/test/xrefs/parsers/test_ccds_parser.py index 4b22225ef..7e10c9a70 100644 --- a/src/python/test/xrefs/parsers/test_ccds_parser.py +++ b/src/python/test/xrefs/parsers/test_ccds_parser.py @@ -25,7 +25,7 @@ def run_and_validate_parsing(ccds_parser: CCDSParser, mock_xref_dbi: DBConnectio { "source_id": SOURCE_ID_CCDS, "species_id": SPECIES_ID_HUMAN, - "dba": "mock_ccds_db_url", + "extra_db_url": "mock_ccds_db_url", "xref_dbi": mock_xref_dbi, } ) diff --git a/src/python/test/xrefs/test_checksum.py b/src/python/test/xrefs/test_checksum.py index 4d86ad0c7..db43bd998 100644 --- a/src/python/test/xrefs/test_checksum.py +++ b/src/python/test/xrefs/test_checksum.py @@ -2,6 +2,7 @@ import os import shutil import datetime +from sqlalchemy import text from typing import Any, Dict, Callable, Optional from ensembl.utils.database import DBConnection from test_helpers import check_row_count @@ -24,6 +25,20 @@ def _create_checksum(args: Optional[Dict[str, Any]] = None) -> Checksum: return Checksum(args, True, True) return _create_checksum +# Function to populate the database with sources +def populate_source_db(mock_source_dbi: DBConnection): + source_data = [ + [1, 'RNACentral', 'ChecksumParser'], + [2, 'UniParc', 'ChecksumParser'], + ] + for row in source_data: + mock_source_dbi.execute( + text("INSERT INTO source (source_id, name, parser) VALUES (:source_id, :name, :parser)"), + {"source_id": row[0], "name": row[1], "parser": row[2],} + ) + + mock_source_dbi.commit() + # Test case to check if an error is raised when a mandatory parameter is missing def test_checksum_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): test_missing_required_param("Checksum", DEFAULT_ARGS, "base_path") @@ -32,6 
+47,8 @@ def test_checksum_missing_required_param(test_missing_required_param: Callable[[ # Test case to check successful run def test_successful_run(mock_source_dbi: DBConnection, checksum: Checksum, pytestconfig: pytest.Config): + populate_source_db(mock_source_dbi) + # Setup for test parameters and create a Checksum instance test_scratch_path = pytestconfig.getoption("test_scratch_path") args = { diff --git a/src/python/test/xrefs/test_download_source.py b/src/python/test/xrefs/test_download_source.py index 4e537ab5b..3b4988677 100644 --- a/src/python/test/xrefs/test_download_source.py +++ b/src/python/test/xrefs/test_download_source.py @@ -30,13 +30,9 @@ def _create_download_source(args: Optional[Dict[str, Any]] = None) -> DownloadSo # Test case to check if an error is raised when a mandatory parameter is missing def test_download_source_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): - test_missing_required_param("DownloadSource", DEFAULT_ARGS, "base_path") - test_missing_required_param("DownloadSource", DEFAULT_ARGS, "parser") - test_missing_required_param("DownloadSource", DEFAULT_ARGS, "name") - test_missing_required_param("DownloadSource", DEFAULT_ARGS, "priority") - test_missing_required_param("DownloadSource", DEFAULT_ARGS, "source_db_url") - test_missing_required_param("DownloadSource", DEFAULT_ARGS, "file") - test_missing_required_param("DownloadSource", DEFAULT_ARGS, "skip_download") + required_params = ["base_path", "parser", "name", "priority", "source_db_url", "file", "skip_download"] + for param in required_params: + test_missing_required_param("DownloadSource", DEFAULT_ARGS, param) # Test case to check if an error is raised when an invalid URL scheme is provided def test_invalid_url_scheme(download_source: DownloadSource, pytestconfig): diff --git a/src/python/test/xrefs/test_schedule_alignment.py b/src/python/test/xrefs/test_schedule_alignment.py index 2254a58e2..cbdee9952 100644 --- a/src/python/test/xrefs/test_schedule_alignment.py +++ b/src/python/test/xrefs/test_schedule_alignment.py @@ -35,19 +35,9 @@ def _create_schedule_alignment(args: Optional[Dict[str, Any]] = None) -> Schedul # Test case to check if an error is raised when a mandatory parameter is missing def test_schedule_alignment_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "species_name") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "release") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "ensembl_fasta") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "xref_fasta") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "seq_type") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "xref_db_url") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "base_path") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "method") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "query_cutoff") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "target_cutoff") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "source_id") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "source_name") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "job_index") + required_params = ["species_name", "release", "ensembl_fasta", "xref_fasta", "seq_type", "xref_db_url", "base_path", "method", 
"query_cutoff", "target_cutoff", "source_id", "source_name", "job_index"] + for param in required_params: + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, param) # Test case to check successful run def test_successful_run(schedule_alignment: ScheduleAlignment, pytestconfig: pytest.Config): diff --git a/src/python/test/xrefs/test_schedule_download.py b/src/python/test/xrefs/test_schedule_download.py index 8c17eb123..817a74607 100644 --- a/src/python/test/xrefs/test_schedule_download.py +++ b/src/python/test/xrefs/test_schedule_download.py @@ -76,7 +76,7 @@ def test_successful_run(schedule_download: ScheduleDownload, pytestconfig): user_name = os.getenv("USER", "default_user") test_db_name = f"{user_name}_test_xref_source_db_{timestamp}" args = { - "config_file": "flatfiles/sources.json", + "config_file": "flatfiles/sources_download.json", "source_db_url": f"{test_mysql_url}/{test_db_name}", "reuse_db": False, "dataflow_output_path": test_scratch_path diff --git a/src/python/test/xrefs/test_schedule_parse.py b/src/python/test/xrefs/test_schedule_parse.py index 04b3cd4ca..b34e19a5a 100644 --- a/src/python/test/xrefs/test_schedule_parse.py +++ b/src/python/test/xrefs/test_schedule_parse.py @@ -68,14 +68,9 @@ def populate_source_db(mock_source_dbi: DBConnection): # Test case to check if an error is raised when a mandatory parameter is missing def test_schedule_parse_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "species_name") - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "release") - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "registry_url") - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "priority") - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "source_db_url") - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "xref_db_url") - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "get_species_file") - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "sources_config_file") + required_params = ["species_name", "release", "registry_url", "priority", "source_db_url", "xref_db_url", "get_species_file"] + for param in required_params: + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, param) # Test case to check if an error is raised when priority is invalid def test_invalid_priority(schedule_parse: ScheduleParse): From d24e06bcb105aaef12529243b3fbb81e6df94017 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 6 Jan 2025 11:37:19 +0000 Subject: [PATCH 08/12] Bug fixes --- .../production/xrefs/AdvisoryXrefReport.py | 20 +-- .../ensembl/production/xrefs/Alignment.py | 19 +-- src/python/ensembl/production/xrefs/Base.py | 2 +- .../ensembl/production/xrefs/DumpEnsembl.py | 7 +- .../production/xrefs/ScheduleCleanup.py | 2 - .../ensembl/production/xrefs/ScheduleParse.py | 8 +- .../xrefs/mappers/CoordinateMapper.py | 6 +- .../production/xrefs/mappers/DisplayXrefs.py | 9 +- .../xrefs/mappers/OfficialNaming.py | 7 +- .../production/xrefs/mappers/XrefLoader.py | 149 +++++++++--------- .../production/xrefs/parsers/HGNCParser.py | 2 +- .../production/xrefs/parsers/RFAMParser.py | 2 +- .../xrefs/parsers/RefSeqCoordinateParser.py | 33 ++-- .../xrefs/flatfiles/sources_download.json | 16 ++ 14 files changed, 144 insertions(+), 138 deletions(-) create mode 100644 src/python/test/xrefs/flatfiles/sources_download.json diff --git a/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py 
b/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py index c12ce0e6b..152fe6976 100644 --- a/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py +++ b/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py @@ -26,15 +26,17 @@ def run(self): datacheck_name: str = self.get_param("datacheck_name", {"type": str}) datacheck_output: str = self.get_param("datacheck_output", {"type": str}) - # Create or locate report file - report_file = self.get_path( - base_path, species_name, release, "dc_report", f"{datacheck_name}.log" - ) - # Return the quotation marks into the output datacheck_output = re.sub("__", "'", datacheck_output) - # Write datacheck result into file - with open(report_file, "a") as fh: - fh.write(datacheck_output) - fh.write("\n") + # Only interested in failed tests + if re.search("Failed test", datacheck_output): + # Create or locate report file + report_file = self.get_path( + base_path, species_name, release, "dc_report", f"{datacheck_name}.log" + ) + + # Write datacheck result into file + with open(report_file, "a") as fh: + fh.write(datacheck_output) + fh.write("\n") diff --git a/src/python/ensembl/production/xrefs/Alignment.py b/src/python/ensembl/production/xrefs/Alignment.py index 5edac0b00..abea37b53 100644 --- a/src/python/ensembl/production/xrefs/Alignment.py +++ b/src/python/ensembl/production/xrefs/Alignment.py @@ -15,6 +15,7 @@ """Alignment module to map xref sequences into ensEMBL ones.""" import re +import shlex import subprocess from sqlalchemy.dialects.mysql import insert @@ -45,23 +46,11 @@ def run(self): # Construct Exonerate command ryo = "xref:%qi:%ti:%ei:%ql:%tl:%qab:%qae:%tab:%tae:%C:%s\n" exe = subprocess.check_output(["which", "exonerate"]).decode("utf-8").strip() - command_string = [ - exe, - "--showalignment", "FALSE", - "--showvulgar", "FALSE", - "--ryo", f"'{ryo}'", - "--gappedextension", "FALSE", - "--model", "'affine:local'", - method, - "--subopt", "no", - "--query", source, - "--target", target, - "--querychunktotal", str(max_chunks), - "--querychunkid", str(chunk) - ] + command_string = f"{exe} --showalignment FALSE --showvulgar FALSE --ryo '{ryo}' --gappedextension FALSE --model 'affine:local' {method} --subopt no --query {source} --target {target} --querychunktotal {max_chunks} --querychunkid {chunk}" + command_list = shlex.split(command_string) # Get exonerate hits - output = subprocess.run(command_string, stdout=subprocess.PIPE, text=True) + output = subprocess.run(command_list, capture_output=True, text=True) exit_code = abs(output.returncode) if exit_code == 0: diff --git a/src/python/ensembl/production/xrefs/Base.py b/src/python/ensembl/production/xrefs/Base.py index 04aad4971..53a04f2d2 100644 --- a/src/python/ensembl/production/xrefs/Base.py +++ b/src/python/ensembl/production/xrefs/Base.py @@ -21,7 +21,7 @@ import fnmatch import gzip import importlib -import wget # type: ignore +import wget import threading import logging import random diff --git a/src/python/ensembl/production/xrefs/DumpEnsembl.py b/src/python/ensembl/production/xrefs/DumpEnsembl.py index c34635f6d..00b219f87 100644 --- a/src/python/ensembl/production/xrefs/DumpEnsembl.py +++ b/src/python/ensembl/production/xrefs/DumpEnsembl.py @@ -40,18 +40,17 @@ def run(self): logging.info(f"Dna and peptide data already dumped for species '{species_name}', skipping.") else: scripts_dir: str = self.get_param("perl_scripts_dir", {"required": True, "type": str}) + dump_script = os.path.join(scripts_dir, 'dump_ensembl.pl') - logging.info(f"Running perl script 
{scripts_dir}/dump_ensembl.pl") + logging.info(f"Running perl script {dump_script}") perl_cmd = [ - "perl", - f"{scripts_dir}/dump_ensembl.pl", + "perl", dump_script, "--cdna_path", cdna_path, "--pep_path", pep_path, "--species", species_name, "--core_db_url", core_db_url, "--release", str(release) ] - # subprocess.run(perl_cmd, check=True, stdout=subprocess.PIPE) subprocess.run(perl_cmd, capture_output=True, text=True, check=True) # Create jobs for peptide dumping and alignment diff --git a/src/python/ensembl/production/xrefs/ScheduleCleanup.py b/src/python/ensembl/production/xrefs/ScheduleCleanup.py index 19388b9fb..9ec5a8b7e 100644 --- a/src/python/ensembl/production/xrefs/ScheduleCleanup.py +++ b/src/python/ensembl/production/xrefs/ScheduleCleanup.py @@ -33,14 +33,12 @@ def run(self): source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) clean_files: Optional[bool] = self.get_param("clean_files", {"type": bool}) clean_dir: Optional[str] = self.get_param("clean_dir", {"type": str}) - split_files_by_species: Optional[bool] = self.get_param("split_files_by_species", {"type": bool}) logging.info("ScheduleCleanup starting with parameters:") logging.info(f"Param: base_path = {base_path}") logging.info(f"Param: source_db_url = {source_db_url}") logging.info(f"Param: clean_files = {clean_files}") logging.info(f"Param: clean_dir = {clean_dir}") - logging.info(f"Param: split_files_by_species = {split_files_by_species}") # Connect to source db db_engine = self.get_db_engine(source_db_url) diff --git a/src/python/ensembl/production/xrefs/ScheduleParse.py b/src/python/ensembl/production/xrefs/ScheduleParse.py index 149eb1c71..2317025f3 100644 --- a/src/python/ensembl/production/xrefs/ScheduleParse.py +++ b/src/python/ensembl/production/xrefs/ScheduleParse.py @@ -122,6 +122,7 @@ def run(self): hgnc_path = None total_sources = 0 + zfin_scheduled = False for source in sources: if source.name == "HGNC": @@ -129,6 +130,9 @@ def run(self): if source.db == "checksum" or source.priority != order_priority: continue + + if source.name == "ZFIN_ID" and zfin_scheduled: + continue dataflow_params = { "species_name": species_name, @@ -205,6 +209,7 @@ def run(self): if source.name == "ZFIN_ID": list_files = [list_files[0]] + zfin_scheduled = True for file in list_files: if source.revision and file == source.revision: @@ -213,8 +218,7 @@ def run(self): dataflow_params["file_name"] = file if re.search(r"^Uniprot", source.name) and hgnc_path: - - hgnc_files = glob.glob(hgnc_path + "/*") + hgnc_files = glob.glob(os.path.join(hgnc_path, "*")) dataflow_params["hgnc_file"] = hgnc_files[0] self.write_output(dataflow_suffix, dataflow_params) diff --git a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py index 6bf44f8bc..e8aac3f77 100644 --- a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py @@ -114,10 +114,10 @@ def run_coordinatemapping(self, species_name: str, species_id: int, scripts_dir: if analysis_id: logging.info(f"Analysis ID is {analysis_id}") - logging.info(f"Running perl script {scripts_dir}/coordinate_mapper.pl") + mapper_script = os.path.join(scripts_dir, 'coordinate_mapper.pl') + logging.info(f"Running perl script {mapper_script}") perl_cmd = [ - "perl", - f"{scripts_dir}/coordinate_mapper.pl", + "perl", mapper_script, "--xref_db_url", str(self.xref()), "--core_db_url", str(self.core()), "--species_id", 
str(species_id), diff --git a/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py index 22b6f61b7..964eb26cd 100644 --- a/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py +++ b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py @@ -18,7 +18,8 @@ import re from datetime import datetime from typing import Dict, List, Tuple -from sqlalchemy import select, insert, update, delete, case, desc, func, aliased +from sqlalchemy import select, insert, update, delete, case, desc, func +from sqlalchemy.orm import aliased from sqlalchemy.engine import Connection from sqlalchemy.sql import Select @@ -744,9 +745,9 @@ def set_display_xrefs_from_stable_table(self) -> None: TranscriptStableIdORM.internal_id, TranscriptStableIdORM.display_xref_id ) for row in xref_dbi.execute(query).mappings().all(): - xref_id = int(row.display_xref_id) + if row.display_xref_id: + xref_id = int(row.display_xref_id) - if xref_id: # Set display xref ID core_dbi.execute( update(TranscriptORM) @@ -757,9 +758,9 @@ def set_display_xrefs_from_stable_table(self) -> None: # Clean up synonyms linked to xrefs which are not display xrefs query = ( select(ExternalSynonymORM) + .join(XrefCORM, XrefCORM.xref_id == ExternalSynonymORM.xref_id) .outerjoin(GeneORM, GeneORM.display_xref_id == XrefCORM.xref_id) .where( - ExternalSynonymORM.xref_id == XrefCORM.xref_id, GeneORM.display_xref_id == None, ) ) diff --git a/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py b/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py index 82768724e..74976506c 100644 --- a/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py +++ b/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py @@ -17,7 +17,8 @@ import logging import re from typing import Any, Dict, Tuple, List -from sqlalchemy import select, func, update, case, desc, insert, aliased, delete +from sqlalchemy import select, func, update, case, desc, insert, delete +from sqlalchemy.orm import aliased from sqlalchemy.engine import Connection from ensembl.xrefs.xref_update_db_model import ( @@ -436,13 +437,13 @@ def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tup def set_the_best_display_name(self, display_names: Dict[int, bool], xref_list: List[int], object_xref_list: List[int], xref_id_to_display: Dict[int, str], verbose: bool, dbi: Connection) -> Tuple[str, int]: gene_symbol, gene_symbol_xref_id = None, None - for xref_id in xref_list: + for index,xref_id in enumerate(xref_list): # Remove object xrefs that are not in the best display names list if not display_names.get(xref_id): if verbose: logging.info(f"Removing {xref_id_to_display[xref_id]} from gene") self.update_object_xref_status( - object_xref_list[xref_id], "MULTI_DELETE", dbi + object_xref_list[index], "MULTI_DELETE", dbi ) else: if verbose: diff --git a/src/python/ensembl/production/xrefs/mappers/XrefLoader.py b/src/python/ensembl/production/xrefs/mappers/XrefLoader.py index c95ee7716..8e9ba37d8 100644 --- a/src/python/ensembl/production/xrefs/mappers/XrefLoader.py +++ b/src/python/ensembl/production/xrefs/mappers/XrefLoader.py @@ -93,7 +93,7 @@ def update(self, species_name: str) -> None: ) # Delete existing xrefs in core DB (only from relevant sources) - self.deleted_existing_xrefs(name_to_external_db_id, xref_dbi) + self.deleted_existing_xrefs(name_to_external_db_id, xref_dbi, core_dbi) # Get the offsets for xref and object_xref tables xref_offset = 
core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() or 0 @@ -498,7 +498,7 @@ def delete_projection_data(self, dbi: Connection) -> None: f"Deleted all PROJECTIONs rows: {counts['external_synonym']} external_synonyms, {counts['dependent_xref']} dependent_xrefs, {counts['object_xref']} object_xrefs, {counts['xref']} xrefs" ) - def deleted_existing_xrefs(self, name_to_external_db_id: Dict[str, int], xref_dbi: Connection) -> None: + def deleted_existing_xrefs(self, name_to_external_db_id: Dict[str, int], xref_dbi: Connection, core_dbi: Connection) -> None: # For each external_db to be updated, delete the existing xrefs query = ( select(SourceUORM.name, func.count(XrefUORM.xref_id).label("count")) @@ -528,92 +528,87 @@ def deleted_existing_xrefs(self, name_to_external_db_id: Dict[str, int], xref_db logging.info(f"For source '{name}'") - Session = sessionmaker(bind=self.core().execution_options(isolation_level="READ COMMITTED")) - with Session.begin() as session: - try: - counts["gene"] = session.execute( - update(GeneORM) - .values(display_xref_id=None, description=None) - .where( - GeneORM.display_xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - logging.info( - f"\tSet display_xref_id=NULL and description=NULL for {counts['gene']} gene row(s)" + try: + counts["gene"] = core_dbi.execute( + update(GeneORM) + .values(display_xref_id=None, description=None) + .where( + GeneORM.display_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, ) + ).rowcount + logging.info( + f"\tSet display_xref_id=NULL and description=NULL for {counts['gene']} gene row(s)" + ) - counts["external_synonym"] = session.execute( - delete(ExternalSynonymORM).where( - ExternalSynonymORM.xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - counts["identity_xref"] = session.execute( - delete(IdentityXrefCORM).where( - IdentityXrefCORM.object_xref_id == ObjectXrefCORM.object_xref_id, - ObjectXrefCORM.xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - counts["object_xref"] = session.execute( - delete(ObjectXrefCORM).where( - ObjectXrefCORM.xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - - MasterXref = aliased(XrefCORM) - DependentXref = aliased(XrefCORM) - - query = select( - ObjectXrefCORM.object_xref_id, - DependentXrefCORM.master_xref_id, - DependentXrefCORM.dependent_xref_id, - ).where( - ObjectXrefCORM.object_xref_id == DependentXrefCORM.object_xref_id, - MasterXref.xref_id == DependentXrefCORM.master_xref_id, - DependentXref.xref_id == DependentXrefCORM.dependent_xref_id, - MasterXref.external_db_id == external_db_id, + counts["external_synonym"] = core_dbi.execute( + delete(ExternalSynonymORM).where( + ExternalSynonymORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, ) - for sub_row in session.execute(query).mappings().all(): - counts["master_dependent_xref"] += session.execute( - delete(DependentXrefCORM).where( - DependentXrefCORM.master_xref_id == sub_row.master_xref_id, - DependentXrefCORM.dependent_xref_id == sub_row.dependent_xref_id, - ) - ).rowcount - counts["master_object_xref"] += session.execute( - delete(ObjectXrefCORM).where( - ObjectXrefCORM.object_xref_id == sub_row.object_xref_id - ) - ).rowcount - - counts["dependent_xref"] = session.execute( + ).rowcount + counts["identity_xref"] = core_dbi.execute( + delete(IdentityXrefCORM).where( + IdentityXrefCORM.object_xref_id == 
ObjectXrefCORM.object_xref_id, + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["object_xref"] = core_dbi.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + + MasterXref = aliased(XrefCORM) + DependentXref = aliased(XrefCORM) + + query = select( + ObjectXrefCORM.object_xref_id, + DependentXrefCORM.master_xref_id, + DependentXrefCORM.dependent_xref_id, + ).where( + ObjectXrefCORM.object_xref_id == DependentXrefCORM.object_xref_id, + MasterXref.xref_id == DependentXrefCORM.master_xref_id, + DependentXref.xref_id == DependentXrefCORM.dependent_xref_id, + MasterXref.external_db_id == external_db_id, + ) + for sub_row in core_dbi.execute(query).mappings().all(): + counts["master_dependent_xref"] += core_dbi.execute( delete(DependentXrefCORM).where( - DependentXrefCORM.dependent_xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, + DependentXrefCORM.master_xref_id == sub_row.master_xref_id, + DependentXrefCORM.dependent_xref_id == sub_row.dependent_xref_id, ) ).rowcount - counts["xref"] = session.execute( - delete(XrefCORM).where(XrefCORM.external_db_id == external_db_id) - ).rowcount - counts["unmapped_object"] = session.execute( - delete(UnmappedObjectORM).where( - UnmappedObjectORM.unmapped_object_type == "xref", - UnmappedObjectORM.external_db_id == external_db_id, + counts["master_object_xref"] += core_dbi.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.object_xref_id == sub_row.object_xref_id ) ).rowcount - logging.info( - f"\tDeleted rows: {counts['external_synonym']} external_synonyms, {counts['identity_xref']} identity_xrefs, {counts['object_xref']} object_xrefs, {counts['master_dependent_xref']} master dependent_xrefs, {counts['master_object_xref']} master object_xrefs, {counts['dependent_xref']} dependent_xrefs, {counts['xref']} xrefs, {counts['unmapped_object']} unmapped_objects" + counts["dependent_xref"] = core_dbi.execute( + delete(DependentXrefCORM).where( + DependentXrefCORM.dependent_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["xref"] = core_dbi.execute( + delete(XrefCORM).where(XrefCORM.external_db_id == external_db_id) + ).rowcount + counts["unmapped_object"] = core_dbi.execute( + delete(UnmappedObjectORM).where( + UnmappedObjectORM.unmapped_object_type == "xref", + UnmappedObjectORM.external_db_id == external_db_id, ) + ).rowcount - session.commit() - except SQLAlchemyError as e: - session.rollback() - logging.error(f"Failed to delete rows for source '{name}': {e}") - raise RuntimeError(f"Transaction failed for source '{name}'") + logging.info( + f"\tDeleted rows: {counts['external_synonym']} external_synonyms, {counts['identity_xref']} identity_xrefs, {counts['object_xref']} object_xrefs, {counts['master_dependent_xref']} master dependent_xrefs, {counts['master_object_xref']} master object_xrefs, {counts['dependent_xref']} dependent_xrefs, {counts['xref']} xrefs, {counts['unmapped_object']} unmapped_objects" + ) + except SQLAlchemyError as e: + logging.error(f"Failed to delete existing rows for source '{name}': {e}") + raise RuntimeError(f"Failed to delete existing rows for source '{name}': {e}") def get_analysis(self, dbi: Connection) -> Dict[str, int]: analysis_ids = {} diff --git a/src/python/ensembl/production/xrefs/parsers/HGNCParser.py b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py index 
d64eead15..21df867fe 100644 --- a/src/python/ensembl/production/xrefs/parsers/HGNCParser.py +++ b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py @@ -24,7 +24,7 @@ from sqlalchemy import select from sqlalchemy.engine import Connection from sqlalchemy.engine.url import URL -from unidecode import unidecode # type: ignore +from unidecode import unidecode from ensembl.core.models import ( Transcript as TranscriptORM, diff --git a/src/python/ensembl/production/xrefs/parsers/RFAMParser.py b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py index 4f13abd3e..7988534fd 100644 --- a/src/python/ensembl/production/xrefs/parsers/RFAMParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py @@ -17,7 +17,7 @@ import logging import os import re -import wget # type: ignore +import wget from typing import Any, Dict, List, Optional, Tuple from urllib.parse import urlparse from sqlalchemy import and_, select diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py index 699a0af83..a7ce38729 100644 --- a/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py @@ -14,6 +14,7 @@ """Parser module for RefSeq coordinate xrefs.""" +import os import json import logging import subprocess @@ -47,7 +48,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: # Connect to the appropriate dbs if db_url: - return self.run_perl_script(args, source_ids, species_name) + return self.run_perl_script(args, source_ids, species_id, species_name) else: # Not all species have an otherfeatures database, skip if not found return 0, f"Skipped. No otherfeatures database for '{species_name}'." 
@@ -74,7 +75,7 @@ def get_source_ids(self, verbose: bool, xref_dbi: Connection) -> Dict[str, int]: return source_ids - def run_perl_script(self, args: Dict[str, Any], source_ids: Dict[str, int], species_name: str) -> Tuple[int, str]: + def run_perl_script(self, args: Dict[str, Any], source_ids: Dict[str, int], species_id: int, species_name: str) -> Tuple[int, str]: # For now, we run a perl script to add the xrefs, which has some mandatory arguments scripts_dir = args.get("perl_scripts_dir") xref_db_url = args.get("xref_db_url") @@ -83,22 +84,22 @@ def run_perl_script(self, args: Dict[str, Any], source_ids: Dict[str, int], spec source_ids_json = json.dumps(source_ids) - logging.info(f"Running perl script {scripts_dir}/refseq_coordinate_parser.pl") - perl_cmd = ( - "perl", - f"{scripts_dir}/refseq_coordinate_parser.pl" - f"--xref_db_url", xref_db_url - f"--core_db_url", args.get('core_db_url'), - f"--otherf_db_url", args.get('extra_db_url'), - f"--source_ids", source_ids_json, - f"--species_id", str(species_id), - f"--species_name", species_name - f"--release", str(args.get('ensembl_release')) - ) - cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + parser_script = os.path.join(scripts_dir, 'refseq_coordinate_parser.pl') + logging.info(f"Running perl script {parser_script}") + perl_cmd = [ + "perl", parser_script, + "--xref_db_url", xref_db_url, + "--core_db_url", args.get('core_db_url'), + "--otherf_db_url", args.get('extra_db_url'), + "--source_ids", source_ids_json, + "--species_id", str(species_id), + "--species_name", species_name, + "--release", str(args.get('ensembl_release')) + ] + cmd_output = subprocess.run(perl_cmd, capture_output=True, text=True) if cmd_output.returncode != 0: - logging.error(f"Perl script ({scripts_dir}/refseq_coordinate_parser.pl) failed with error: {cmd_output.stderr.decode('utf-8')}") + logging.error(f"Perl script ({scripts_dir}/refseq_coordinate_parser.pl) failed with error: {cmd_output.stderr}") return 1, "Failed to add refseq_import xrefs." return 0, "Added refseq_import xrefs." 
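The hunk above swaps a malformed command tuple (missing commas between elements, passed with shell=True) for a proper argument list handed to subprocess.run with capture_output=True and text=True, followed by a returncode check that logs stderr on failure. A minimal sketch of that invocation pattern, assuming a hypothetical helper name and option dict that are not part of this patch:

    import logging
    import os
    import subprocess

    def run_helper_script(scripts_dir: str, script_name: str, options: dict) -> int:
        # Build the command as a list: each flag and value is its own argv element,
        # so URLs and JSON strings need no shell quoting.
        cmd = ["perl", os.path.join(scripts_dir, script_name)]
        for flag, value in options.items():
            cmd.extend([f"--{flag}", str(value)])

        # text=True returns stdout/stderr as str, so no manual decode('utf-8') is needed.
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            logging.error(f"{script_name} failed: {result.stderr}")
        return result.returncode

Building the argument list and dropping shell=True matches how the dump_ensembl.pl and coordinate_mapper.pl calls are constructed elsewhere in this series.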
diff --git a/src/python/test/xrefs/flatfiles/sources_download.json b/src/python/test/xrefs/flatfiles/sources_download.json new file mode 100644 index 000000000..1b45a2acb --- /dev/null +++ b/src/python/test/xrefs/flatfiles/sources_download.json @@ -0,0 +1,16 @@ +[ + { + "name" : "ArrayExpress", + "parser" : "ArrayExpressParser", + "file" : "Database", + "db" : "core", + "priority" : 1 + }, + { + "name" : "RNACentral", + "parser" : "ChecksumParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/md5/md5.tsv.gz", + "db" : "checksum", + "priority" : 1 + } +] \ No newline at end of file From 833158748d708a6f354275a7f576b03587496980 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 13 Jan 2025 10:12:42 +0000 Subject: [PATCH 09/12] More fixes --- nextflow/config/xref.config | 6 +++++- src/python/ensembl/production/xrefs/Base.py | 15 +++++++++++++-- .../production/xrefs/mappers/CoordinateMapper.py | 1 + .../production/xrefs/mappers/DisplayXrefs.py | 5 +++-- 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/nextflow/config/xref.config b/nextflow/config/xref.config index 2518e806e..2d3047bf3 100644 --- a/nextflow/config/xref.config +++ b/nextflow/config/xref.config @@ -14,8 +14,8 @@ params.sources_config_file = "${params.work_dir}/ensembl-production/src/python/e params.source_db_url = '' params.skip_download = 0 params.reuse_db = 0 -params.split_files_by_species = 1 params.tax_ids_file = '' +params.tax_ids_list = '' params.update_mode = 0 params.base_path = '' @@ -69,6 +69,10 @@ profiles { memory = 1.GB } + withLabel: mem10GB { + memory = 10.GB + } + withLabel:cleanup_mem { memory = 4.GB errorStrategy = 'retry' diff --git a/src/python/ensembl/production/xrefs/Base.py b/src/python/ensembl/production/xrefs/Base.py index 53a04f2d2..00972493a 100644 --- a/src/python/ensembl/production/xrefs/Base.py +++ b/src/python/ensembl/production/xrefs/Base.py @@ -892,8 +892,19 @@ def get_xref_mapper(self, xref_url: str, species: str, base_path: str, release: if not core_url: core_url = self.get_db_from_registry(species, "core", release, registry) - core_db = self.get_db_engine(core_url) - xref_db = self.get_db_engine(xref_url) + core_db = create_engine( + make_url(core_url), + isolation_level="AUTOCOMMIT", + pool_recycle=18000, + pool_pre_ping=True + ) + + xref_db = create_engine( + make_url(xref_url), + isolation_level="AUTOCOMMIT", + pool_recycle=18000, + pool_pre_ping=True + ) # Extract host and dbname from xref URL xref_url_obj = make_url(xref_url) diff --git a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py index e8aac3f77..35ee7ac80 100644 --- a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py @@ -14,6 +14,7 @@ """Mapper module for processing coordinate xref data.""" +import os import subprocess import logging from datetime import datetime diff --git a/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py index 964eb26cd..8bcca8a83 100644 --- a/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py +++ b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py @@ -18,10 +18,11 @@ import re from datetime import datetime from typing import Dict, List, Tuple -from sqlalchemy import select, insert, update, delete, case, desc, func +from sqlalchemy import select, update, delete, case, desc, func from sqlalchemy.orm import 
aliased from sqlalchemy.engine import Connection from sqlalchemy.sql import Select +from sqlalchemy.dialects.mysql import insert from ensembl.core.models import ( Gene as GeneORM, @@ -127,7 +128,7 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: ensembl_object_type=object_type, source_id=row.source_id, priority=priority, - ) + ).on_duplicate_key_update(priority=priority) ) logging.info(f"{priority} - {row.name}") From f7cf53aeaf6fff2b9269f91382ee19e3338d4a73 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 13 Jan 2025 10:13:37 +0000 Subject: [PATCH 10/12] Uniprot & Refseq are always split by species --- nextflow/workflows/xrefDownload.nf | 57 ++++-------------- nextflow/workflows/xrefProcess.nf | 58 +++++-------------- scripts/xrefs/cleanup_and_split_source.pl | 16 +++-- .../ensembl/production/xrefs/ScheduleParse.py | 27 ++++----- 4 files changed, 52 insertions(+), 106 deletions(-) diff --git a/nextflow/workflows/xrefDownload.nf b/nextflow/workflows/xrefDownload.nf index 8034627ed..66dd4ee98 100644 --- a/nextflow/workflows/xrefDownload.nf +++ b/nextflow/workflows/xrefDownload.nf @@ -4,12 +4,6 @@ params.pipeline_name = 'Xref Download Pipeline' params.help = false -// Ensure all paths are absolute -params.scripts_dir = file(params.scripts_dir).toAbsolutePath().toString() -params.perl_scripts_dir = file(params.perl_scripts_dir).toAbsolutePath().toString() -params.base_path = file(params.base_path).toAbsolutePath().toString() -params.clean_dir = file(params.clean_dir).toAbsolutePath().toString() - println """\ XREF DOWNLOAD PIPELINE ====================== @@ -18,10 +12,10 @@ println """\ reuse_db : ${params.reuse_db} skip_download : ${params.skip_download} clean_files : ${params.clean_files} - split_files_by_species : ${params.split_files_by_species} config_file : ${params.config_file} clean_dir : ${params.clean_dir} tax_ids_file : ${params.tax_ids_file} + tax_ids_list : ${params.tax_ids_list} update_mode : ${params.update_mode} """ .stripIndent() @@ -45,9 +39,6 @@ def helpMessage() { --clean_files (optional) If set to 1, the Cleanup analysis will be run for RefSeq and UniProt files. Default: 1 - --split_files_by_species (optional) If set to 1, UniProt and RefSeq file will be split according to taxonomy ID. - Default: 1 - --config_file (optional) Path to the json file containing information about xref sources to download. Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_all_sources.json @@ -55,10 +46,13 @@ def helpMessage() { Default: [--base_path]/clean_files --tax_ids_file (optional) Path to the file containing the taxonomy IDs of the species to extract data for. - Used to update the data for the provided species. + Each taxonomy ID on a line. + + --tax_ids_list (optional) List of taxonomy IDs of the species to extract data for, separated by commas. + Takes precedence over --tax_ids_file. --update_mode (optional) If set to 1, pipeline is in update mode, refreshing/updating its data for new taxonomy IDs. - Only used if --tax_ids_file is set. Default: 0 + Only used if --tax_ids_file or --tax_ids_list are set. 
Default: 0 """.stripIndent() } @@ -89,13 +83,8 @@ workflow { ScheduleCleanup(CleanupTmpFiles.out, timestamp) Checksum(ScheduleCleanup.out[0], timestamp) - if (params.split_files_by_species) { - CleanupSplitSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) - NotifyByEmail(Checksum.out.concat(CleanupSplitSource.out.collect()).collect(), timestamp) - } else { - CleanupSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) - NotifyByEmail(Checksum.out.concat(CleanupSource.out.collect()).collect(), timestamp) - } + CleanupSplitSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) + NotifyByEmail(Checksum.out.concat(CleanupSplitSource.out.collect()).collect(), timestamp) } process ScheduleDownload { @@ -158,7 +147,7 @@ process ScheduleCleanup { path 'dataflow_cleanup_sources.json' """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleCleanup --base_path ${params.base_path} --source_db_url ${params.source_db_url} --clean_files ${params.clean_files} --clean_dir ${params.clean_dir} --split_files_by_species ${params.split_files_by_species} --log_timestamp $timestamp + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleCleanup --base_path ${params.base_path} --source_db_url ${params.source_db_url} --clean_files ${params.clean_files} --clean_dir ${params.clean_dir} --log_timestamp $timestamp """ } @@ -195,7 +184,9 @@ process CleanupSplitSource { version_file = (x =~ /"version_file":\s*"(.*?)"/)[0][1] cmd_params = "${cmd_params} --version_file '${version_file}'" } - if (params.tax_ids_file) { + if (params.tax_ids_list) { + cmd_params = "${cmd_params} --tax_ids_list ${params.tax_ids_list}" + } else if (params.tax_ids_file) { cmd_params = "${cmd_params} --tax_ids_file ${params.tax_ids_file}" } @@ -204,30 +195,6 @@ process CleanupSplitSource { """ } -process CleanupSource { - label 'cleanup_mem' - tag "$src_name" - - input: - val x - val timestamp - - output: - val 'CleanupDone' - - shell: - cmd_params = "" - src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] - if (x =~ /"version_file":/) { - version_file = (x =~ /"version_file":\s*"(.*?)"/)[0][1] - cmd_params = "${cmd_params} --version_file '${version_file}'" - } - - """ - perl ${params.perl_scripts_dir}/cleanup_source.pl --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --skip_download ${params.skip_download} --clean_files ${params.clean_files} $cmd_params - """ -} - process NotifyByEmail { label 'small_process' diff --git a/nextflow/workflows/xrefProcess.nf b/nextflow/workflows/xrefProcess.nf index 8ae1d8c19..58788f082 100644 --- a/nextflow/workflows/xrefProcess.nf +++ b/nextflow/workflows/xrefProcess.nf @@ -4,11 +4,6 @@ params.pipeline_name = 'Xref Process Pipeline' params.help = false -// Ensure all paths are absolute -params.scripts_dir = file(params.scripts_dir).toAbsolutePath().toString() -params.perl_scripts_dir = file(params.perl_scripts_dir).toAbsolutePath().toString() -params.base_path = file(params.base_path).toAbsolutePath().toString() - println """\ XREF PROCESS PIPELINE ====================== @@ -21,7 +16,6 @@ println """\ species : ${params.species} antispecies : ${params.antispecies} division : ${params.division} - split_files_by_species : ${params.split_files_by_species} sources_config_file : ${params.sources_config_file} registry_file : ${params.registry_file} dc_config_file : ${params.dc_config_file} @@ -57,9 +51,6 @@ def 
helpMessage() { --division (optional) Comma-separated list of divisions to run pipeline on. Will be disregarded if --run_all is set to 1. - --split_files_by_species (optional) If set to 1, UniProt and RefSeq file will be split according to taxonomy ID. - Default: 1 - --sources_config_file (optional) Path to the ini file containing information about all xref sources and species/divisions. Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_config.ini @@ -175,7 +166,10 @@ workflow species_flow { // Run datachecks RunXrefCriticalDatacheck(Mapping.out) RunXrefAdvisoryDatacheck(RunXrefCriticalDatacheck.out) - advisory_report_ch = process_output(RunXrefAdvisoryDatacheck.out) + + dataflow_combined = RunXrefAdvisoryDatacheck.out.dataflow_success + .mix(RunXrefAdvisoryDatacheck.out.dataflow_fail) + advisory_report_ch = process_output(dataflow_combined) // Collect advisory datacheck outputs AdvisoryXrefReport(advisory_report_ch, timestamp) @@ -248,14 +242,8 @@ process ScheduleParse { tuple val(species_name), path('dataflow_primary_sources.json') tuple val(species_name), path('dataflow_schedule_secondary.json') - shell: - cmd_params = "" - if (params.split_files_by_species) { - cmd_params = "${cmd_params} --get_species_file 1" - } - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 1 --sources_config_file ${params.sources_config_file} --source_db_url ${params.source_db_url} --xref_db_url ${params.xref_db_url} --base_path ${params.base_path} --log_timestamp $timestamp $cmd_params + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 1 --sources_config_file ${params.sources_config_file} --source_db_url ${params.source_db_url} --xref_db_url ${params.xref_db_url} --base_path ${params.base_path} --log_timestamp $timestamp """ } @@ -291,14 +279,8 @@ process ScheduleSecondaryParse { tuple val(species_name), path('dataflow_secondary_sources.json') tuple val(species_name), path('dataflow_schedule_tertiary.json') - shell: - cmd_params = "" - if (params.split_files_by_species) { - cmd_params = "${cmd_params} --get_species_file 1" - } - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 2 --source_db_url ${params.source_db_url} --base_path ${params.base_path} --log_timestamp $timestamp $cmd_params + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 2 --source_db_url ${params.source_db_url} --base_path ${params.base_path} --log_timestamp $timestamp """ } @@ -334,14 +316,8 @@ process ScheduleTertiaryParse { tuple val(species_name), path('dataflow_tertiary_sources.json') tuple val(species_name), path('dataflow_dump_ensembl.json') - shell: - cmd_params = "" - if (params.split_files_by_species) { - cmd_params = "${cmd_params} --get_species_file 1" - } - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 3 --source_db_url ${params.source_db_url} --base_path ${params.base_path} --log_timestamp 
$timestamp $cmd_params + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 3 --source_db_url ${params.source_db_url} --base_path ${params.base_path} --log_timestamp $timestamp """ } @@ -365,7 +341,7 @@ process ParseTertiarySource { } process DumpEnsembl { - label 'default_process' + label 'mem10GB' tag "$species_name" input: @@ -386,7 +362,7 @@ process DumpEnsembl { } process DumpXref { - label 'mem1GB' + label 'mem4GB' tag "$species_name" input: @@ -439,7 +415,7 @@ process Alignment { } process ScheduleMapping { - label 'small_process' + label 'mem1GB' tag "$species_name" input: @@ -564,7 +540,7 @@ process RunXrefCriticalDatacheck { val species_name """ - perl ${params.perl_scripts_dir}/run_process.pl -class='Nextflow::RunDataChecks' -datacheck_names='ForeignKeys' -datacheck_groups='xref_mapping' -datacheck_types='critical' -registry_file=${params.registry_file} -config_file=${params.dc_config_file} -history_file='${params.history_file}' -old_server_uri='${params.old_server_uri}' -failures_fatal=1 -species=$species_name + perl ${params.perl_scripts_dir}/run_process.pl -class='Nextflow::RunDataChecks' -datacheck_names='ForeignKeys' -datacheck_groups='xref_mapping' -datacheck_types='critical' -registry_file=${params.registry_file} -config_file=${params.dc_config_file} -failures_fatal=1 -species=$species_name """ } @@ -576,16 +552,17 @@ process RunXrefAdvisoryDatacheck { val species_name output: - tuple val(species_name), path('dataflow_4.json') + tuple val(species_name), path('dataflow_3.json'), emit: dataflow_success, optional: true + tuple val(species_name), path('dataflow_4.json'), emit: dataflow_fail, optional: true """ - perl ${params.perl_scripts_dir}/run_process.pl -class='Nextflow::RunDataChecks' -datacheck_groups='xref_mapping' -datacheck_types='advisory' -registry_file=${params.registry_file} -config_file=${params.dc_config_file} -history_file='${params.history_file}' -old_server_uri='${params.old_server_uri}' -failures_fatal=0 -species=$species_name + perl ${params.perl_scripts_dir}/run_process.pl -class='Nextflow::RunDataChecks' -datacheck_groups='xref_mapping' -datacheck_types='advisory' -registry_file=${params.registry_file} -config_file=${params.dc_config_file} -failures_fatal=0 -species=$species_name """ } process AdvisoryXrefReport { label 'default_process' - tag "$species_name - $dc_name" + tag "$species_name" input: tuple val(species_name), val(dataflow) @@ -594,9 +571,6 @@ process AdvisoryXrefReport { output: val species_name - shell: - dc_name = (dataflow =~ /"datacheck_name":\s*"([A-Za-z]+)"/)[0][1] - script: formatted_dataflow = dataflow.replace("'", '__') """ @@ -629,4 +603,4 @@ process NotifyByEmail { """ python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp """ -} \ No newline at end of file +} diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl index 0b956a31d..f1e09dde6 100644 --- a/scripts/xrefs/cleanup_and_split_source.pl +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -28,7 +28,7 @@ use Nextflow::Utils; -my ($base_path, $source_db_url, $source_name, $clean_dir, $clean_files, $version_file, $tax_ids_file, $update_mode, $log_timestamp); +my ($base_path, $source_db_url, 
$source_name, $clean_dir, $clean_files, $version_file, $tax_ids_file, $tax_ids_list, $update_mode, $log_timestamp); GetOptions( 'base_path=s' => \$base_path, 'source_db_url=s' => \$source_db_url, @@ -37,6 +37,7 @@ 'clean_files=i' => \$clean_files, 'version_file:s' => \$version_file, 'tax_ids_file:s' => \$tax_ids_file, + 'tax_ids_list:s' => \$tax_ids_list, 'update_mode:i' => \$update_mode, 'log_timestamp:s' => \$log_timestamp ); @@ -56,6 +57,7 @@ add_to_log_file($log_file, "CleanupSplitSource starting for source $source_name"); add_to_log_file($log_file, "Param: tax_ids_file = $tax_ids_file") if $tax_ids_file; + add_to_log_file($log_file, "Param: tax_ids_list = $tax_ids_list") if $tax_ids_list; } # Do nothing if not a uniprot or refseq source @@ -100,13 +102,19 @@ # Extract taxonomy IDs my %tax_ids; -my ($skipped_species, $added_species) = (0, 0); -if ($tax_ids_file && $update_mode) { +if ($tax_ids_list) { + $tax_ids_list =~ s/\s*,\s*/,/g; + %tax_ids = map { $_ => 1} split(",", $tax_ids_list); +} elsif ($tax_ids_file) { open my $fh, '<', $tax_ids_file or die "Couldn't open tax_ids_file '$tax_ids_file' $!"; chomp(my @lines = <$fh>); close $fh; %tax_ids = map { $_ => 1 } @lines; +} +my $tax_ids_filter = ($tax_ids_file || $tax_ids_list ? 1 : 0); +my ($skipped_species, $added_species) = (0, 0); +if ($tax_ids_filter && $update_mode) { # Check if any taxonomy IDs already have files foreach my $tax_id (keys %tax_ids) { my @tax_files = glob(catfile($output_path, "**", "**", "**", "**", "$output_file_name-$tax_id")); @@ -165,7 +173,7 @@ # Only continue with wanted species next unless $species_id; - next if $tax_ids_file && (!defined($tax_ids{$species_id}) || !$tax_ids{$species_id}); + next if $tax_ids_filter && (!defined($tax_ids{$species_id}) || !$tax_ids{$species_id}); # Clean up data if ($clean_files) { diff --git a/src/python/ensembl/production/xrefs/ScheduleParse.py b/src/python/ensembl/production/xrefs/ScheduleParse.py index 2317025f3..c9106b672 100644 --- a/src/python/ensembl/production/xrefs/ScheduleParse.py +++ b/src/python/ensembl/production/xrefs/ScheduleParse.py @@ -37,7 +37,6 @@ def run(self): order_priority: int = self.get_param("priority", {"required": True, "type": int}) source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) - get_species_file: bool = self.get_param("get_species_file", {"required": True, "type": bool}) core_db_url: Optional[str] = self.get_param("species_db", {"type": str}) logging.info(f"ScheduleParse starting for species '{species_name}'") @@ -194,18 +193,17 @@ def run(self): ) # For Uniprot and Refseq, files might have been split by species - if get_species_file: - file_prefix = { - "Uniprot/SWISSPROT": "uniprot_sprot", - "Uniprot/SPTREMBL": "uniprot_trembl", - "RefSeq_dna": "refseq_rna", - "RefSeq_peptide": "refseq_protein", - }.get(source.name) - - if file_prefix: - list_files = glob.glob( - f"{file_name}/**/{file_prefix}-{species_id}", recursive=True - ) + file_prefix = { + "Uniprot/SWISSPROT": "uniprot_sprot", + "Uniprot/SPTREMBL": "uniprot_trembl", + "RefSeq_dna": "refseq_rna", + "RefSeq_peptide": "refseq_protein", + }.get(source.name) + + if file_prefix: + list_files = glob.glob( + f"{file_name}/**/{file_prefix}-{species_id}", recursive=True + ) if source.name == "ZFIN_ID": list_files = [list_files[0]] @@ -218,8 +216,7 @@ def run(self): dataflow_params["file_name"] = file if re.search(r"^Uniprot", source.name) and hgnc_path: - hgnc_files = 
glob.glob(os.path.join(hgnc_path, "*")) - dataflow_params["hgnc_file"] = hgnc_files[0] + dataflow_params["hgnc_file"] = hgnc_path self.write_output(dataflow_suffix, dataflow_params) total_sources += 1 From 95b107a12c28932f8f719b7b258f7fe485742191 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 13 Jan 2025 11:25:11 +0000 Subject: [PATCH 11/12] Add --config_file parameter to list of params --- nextflow/workflows/xrefProcess.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nextflow/workflows/xrefProcess.nf b/nextflow/workflows/xrefProcess.nf index 58788f082..3136fc4c8 100644 --- a/nextflow/workflows/xrefProcess.nf +++ b/nextflow/workflows/xrefProcess.nf @@ -16,6 +16,7 @@ println """\ species : ${params.species} antispecies : ${params.antispecies} division : ${params.division} + config_file : ${params.config_file} sources_config_file : ${params.sources_config_file} registry_file : ${params.registry_file} dc_config_file : ${params.dc_config_file} @@ -51,6 +52,9 @@ def helpMessage() { --division (optional) Comma-separated list of divisions to run pipeline on. Will be disregarded if --run_all is set to 1. + --config_file (optional) Path to the json file containing information about xref sources to download. + Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_all_sources.json + --sources_config_file (optional) Path to the ini file containing information about all xref sources and species/divisions. Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_config.ini From 5ba71f5d470465c8e94f6dad6d36ad2bbf860f3f Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Tue, 21 Jan 2025 14:38:00 +0000 Subject: [PATCH 12/12] Minor email notification update + coordinate mapper fix --- .../ensembl/production/xrefs/Checksum.py | 6 +- .../production/xrefs/EmailNotification.py | 58 +++++++++++++------ .../production/xrefs/ScheduleCleanup.py | 8 +-- .../production/xrefs/ScheduleDownload.py | 6 +- .../xrefs/mappers/CoordinateMapper.py | 4 +- 5 files changed, 51 insertions(+), 31 deletions(-) diff --git a/src/python/ensembl/production/xrefs/Checksum.py b/src/python/ensembl/production/xrefs/Checksum.py index 2d990cf70..20aef07ab 100644 --- a/src/python/ensembl/production/xrefs/Checksum.py +++ b/src/python/ensembl/production/xrefs/Checksum.py @@ -28,9 +28,9 @@ def run(self): skip_download: bool = self.get_param("skip_download", {"required": True, "type": bool}) logging.info("Checksum starting with parameters:") - logging.info(f"Param: base_path = {base_path}") - logging.info(f"Param: source_db_url = {source_db_url}") - logging.info(f"Param: skip_download = {skip_download}") + logging.info(f"\tParam: base_path = {base_path}") + logging.info(f"\tParam: source_db_url = {source_db_url}") + logging.info(f"\tParam: skip_download = {skip_download}") # Connect to source db db_engine = self.get_db_engine(source_db_url) diff --git a/src/python/ensembl/production/xrefs/EmailNotification.py b/src/python/ensembl/production/xrefs/EmailNotification.py index 932b0c1b7..0acae8b6d 100644 --- a/src/python/ensembl/production/xrefs/EmailNotification.py +++ b/src/python/ensembl/production/xrefs/EmailNotification.py @@ -58,8 +58,8 @@ def run(self): sources_data, added_species, skipped_species = self.extract_download_statistics(data) email_message += self.format_download_statistics(sources_data, added_species, skipped_species) elif re.search("Process", pipeline_name): - parsed_sources, species_counts = self.extract_process_statistics(data) - email_message += 
self.format_process_statistics(parsed_sources, species_counts) + parsed_sources, absolute_sources, species_counts = self.extract_process_statistics(data) + email_message += self.format_process_statistics(parsed_sources, species_counts, absolute_sources) # Send email self.send_email(email_address, email_server, pipeline_name, email_message) @@ -117,8 +117,8 @@ def combine_logs(self, base_path: str, timestamp: str, type: str) -> str: return main_log_file def extract_parameters(self, data: str) -> Dict[str, str]: - parameters_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Param: (\w+) = (.*)", data) - return {param[0]: param[1] for param in parameters_list} + parameters_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| \tParam: (\w+) = (.*)", data) + return {param[0]: param[1] for param in parameters_list if param[0] != 'order_priority'} def format_parameters(self, parameters: Dict[str, str]) -> str: message = "
<b>Run Parameters</b><br>
" @@ -217,14 +217,15 @@ def format_download_statistics(self, sources_data: Dict[str, Dict[str, Any]], ad return message - def extract_process_statistics(self, data: str) -> Tuple[Dict[str, Dict[str, str]], Dict[str, Dict[str, int]]]: - parsed_sources = self.extract_parsed_sources(data) + def extract_process_statistics(self, data: str) -> Tuple[Dict[str, Dict[str, str]], Dict[str, bool], Dict[str, Dict[str, int]]]: + parsed_sources, absolute_sources = self.extract_parsed_sources(data) species_counts = self.extract_species_counts(data) - return parsed_sources, species_counts + return parsed_sources, absolute_sources, species_counts - def extract_parsed_sources(self, data: str) -> Dict[str, Dict[str, str]]: + def extract_parsed_sources(self, data: str) -> Tuple[Dict[str, Dict[str, str]], Dict[str, bool]]: parsed_sources = {} + absolute_sources = {} matches_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ParseSource starting for source '([\w\/]+)' with parser '([\w\/]+)' for species '([\w\/]+)'", data) for species in matches_list: @@ -232,8 +233,9 @@ def extract_parsed_sources(self, data: str) -> Dict[str, Dict[str, str]]: if species_name not in parsed_sources: parsed_sources[species_name] = {} parsed_sources[species_name][source_name] = parser + absolute_sources[source_name] = True - return parsed_sources + return parsed_sources, absolute_sources def extract_species_counts(self, data: str) -> Dict[str, Dict[str, int]]: species_counts = {} @@ -258,18 +260,36 @@ def extract_species_counts(self, data: str) -> Dict[str, Dict[str, int]]: return species_counts - def format_process_statistics(self, parsed_sources: Dict[str, Dict[str, str]], species_counts: Dict[str, Dict[str, int]]) -> str: - message = "
<br>--Species Statistics--<br>
" + def format_process_statistics(self, parsed_sources: Dict[str, Dict[str, str]], species_counts: Dict[str, Dict[str, int]], absolute_sources: Dict[str, bool]) -> str: + cell_style = 'style="border-right: 1px solid #000; padding: 5px;"' + + message = "
Source Statistics
" + message += f"" + for source_name in sorted(absolute_sources): + message += f"" + message += f"" for species_name, species_data in parsed_sources.items(): - message += f"{species_name}:
" - message += f"{self.INDENT}Sources parsed: " + ",".join(species_data.keys()) + "
" - - xref_counts = species_counts[species_name] - message += f"{self.INDENT}Xrefs added: " - for xref_type, count in xref_counts.items(): - message += f"{count} {xref_type} " - message += "
" + message += f"" + for source_name in sorted(absolute_sources): + message += f"" if source_name in species_data else f"" + message += "" + message += "
Species{source_name}
{species_name}X
" + + message += "
Xref Data Statistics
" + message += f"" + message += f"" + + for species_name, species_data in species_counts.items(): + message += f"" + message += f"" + message += f"" + message += f"" + message += f"" + message += f"" + message += f"" + message += "" + message += "
SpeciesDIRECTDEPENDENTINFERRED_PAIRCHECKSUMSEQUENCE_MATCHMISC
{species_name}{species_data['DIRECT']}{species_data['DEPENDENT']}{species_data['INFERRED_PAIR']}{species_data['CHECKSUM']}{species_data['SEQUENCE_MATCH']}{species_data['MISC']}
" return message diff --git a/src/python/ensembl/production/xrefs/ScheduleCleanup.py b/src/python/ensembl/production/xrefs/ScheduleCleanup.py index 9ec5a8b7e..33a60d065 100644 --- a/src/python/ensembl/production/xrefs/ScheduleCleanup.py +++ b/src/python/ensembl/production/xrefs/ScheduleCleanup.py @@ -35,10 +35,10 @@ def run(self): clean_dir: Optional[str] = self.get_param("clean_dir", {"type": str}) logging.info("ScheduleCleanup starting with parameters:") - logging.info(f"Param: base_path = {base_path}") - logging.info(f"Param: source_db_url = {source_db_url}") - logging.info(f"Param: clean_files = {clean_files}") - logging.info(f"Param: clean_dir = {clean_dir}") + logging.info(f"\tParam: base_path = {base_path}") + logging.info(f"\tParam: source_db_url = {source_db_url}") + logging.info(f"\tParam: clean_files = {clean_files}") + logging.info(f"\tParam: clean_dir = {clean_dir}") # Connect to source db db_engine = self.get_db_engine(source_db_url) diff --git a/src/python/ensembl/production/xrefs/ScheduleDownload.py b/src/python/ensembl/production/xrefs/ScheduleDownload.py index 10b2a32af..a49feb59d 100644 --- a/src/python/ensembl/production/xrefs/ScheduleDownload.py +++ b/src/python/ensembl/production/xrefs/ScheduleDownload.py @@ -26,9 +26,9 @@ def run(self) -> None: reuse_db: bool = self.get_param("reuse_db", {"required": True, "type": bool}) logging.info("ScheduleDownload starting with parameters:") - logging.info(f"Param: config_file = {config_file}") - logging.info(f"Param: source_db_url = {source_db_url}") - logging.info(f"Param: reuse_db = {reuse_db}") + logging.info(f"\tParam: config_file = {config_file}") + logging.info(f"\tParam: source_db_url = {source_db_url}") + logging.info(f"\tParam: reuse_db = {reuse_db}") # Create the source db from url self.create_source_db(source_db_url, reuse_db) diff --git a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py index 35ee7ac80..2ed64a65f 100644 --- a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py @@ -119,8 +119,8 @@ def run_coordinatemapping(self, species_name: str, species_id: int, scripts_dir: logging.info(f"Running perl script {mapper_script}") perl_cmd = [ "perl", mapper_script, - "--xref_db_url", str(self.xref()), - "--core_db_url", str(self.core()), + "--xref_db_url", str(self.xref().url), + "--core_db_url", str(self.core().url), "--species_id", str(species_id), "--output_dir", output_dir, "--analysis_id", str(analysis_id)