From 57ccb9dac583b50c78c79e35b86055acf352ee49 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 14 Oct 2024 10:20:04 +0100 Subject: [PATCH 01/12] New python/nextflow processing xref pipeline --- nextflow/config/xref.config | 68 +- nextflow/workflows/xrefDownload.nf | 312 +-- nextflow/workflows/xrefProcess.nf | 627 ++++++ scripts/xrefs/cleanup_and_split_source.pl | 4 +- scripts/xrefs/cleanup_source.pl | 2 +- scripts/xrefs/coordinate_mapper.pl | 531 +++++ scripts/xrefs/dump_ensembl.pl | 86 + scripts/xrefs/refseq_coordinate_parser.pl | 468 +++++ src/python/ensembl/common/Params.py | 436 ++-- src/python/ensembl/common/__init__.py | 15 + .../production/xrefs/AdvisoryXrefReport.py | 39 + .../ensembl/production/xrefs/Alignment.py | 91 + src/python/ensembl/production/xrefs/Base.py | 1780 +++++++++-------- .../ensembl/production/xrefs/Checksum.py | 54 +- .../production/xrefs/CoordinateMapping.py | 50 + .../ensembl/production/xrefs/DirectXrefs.py | 39 + .../production/xrefs/DownloadSource.py | 96 +- .../ensembl/production/xrefs/DumpEnsembl.py | 81 + .../ensembl/production/xrefs/DumpXref.py | 135 ++ .../xrefs/EmailAdvisoryXrefReport.py | 100 + .../production/xrefs/EmailNotification.py | 390 +++- .../ensembl/production/xrefs/Mapping.py | 91 + .../ensembl/production/xrefs/ParseSource.py | 90 + .../production/xrefs/ProcessAlignment.py | 37 + .../production/xrefs/RNACentralMapping.py | 62 + .../production/xrefs/ScheduleAlignment.py | 73 + .../production/xrefs/ScheduleCleanup.py | 82 +- .../production/xrefs/ScheduleDownload.py | 90 +- .../production/xrefs/ScheduleMapping.py | 56 + .../ensembl/production/xrefs/ScheduleParse.py | 219 ++ .../production/xrefs/ScheduleSpecies.py | 178 ++ .../production/xrefs/UniParcMapping.py | 62 + .../ensembl/production/xrefs/__init__.py | 15 + .../xrefs/config/xref_all_sources.json | 58 +- .../production/xrefs/config/xref_config.ini | 35 +- .../production/xrefs/mappers/BasicMapper.py | 432 ++++ .../xrefs/mappers/ChecksumMapper.py | 111 + .../xrefs/mappers/CoordinateMapper.py | 130 ++ .../production/xrefs/mappers/CoreInfo.py | 320 +++ .../xrefs/mappers/DirectXrefsMapper.py | 182 ++ .../production/xrefs/mappers/DisplayXrefs.py | 871 ++++++++ .../xrefs/mappers/OfficialNaming.py | 637 ++++++ .../xrefs/mappers/ProcessMappings.py | 382 ++++ .../production/xrefs/mappers/ProcessMoves.py | 478 +++++ .../production/xrefs/mappers/ProcessPaired.py | 248 +++ .../xrefs/mappers/ProcessPriorities.py | 408 ++++ .../xrefs/mappers/RNACentralMapper.py | 28 + .../production/xrefs/mappers/TestMappings.py | 199 ++ .../production/xrefs/mappers/UniParcMapper.py | 28 + .../production/xrefs/mappers/XrefLoader.py | 804 ++++++++ .../production/xrefs/mappers/__init__.py | 15 + .../xrefs/mappers/methods/ChecksumBasic.py | 91 + .../xrefs/mappers/methods/MySQLChecksum.py | 48 + .../xrefs/mappers/methods/__init__.py | 15 + .../xrefs/mappers/species/__init__.py | 15 + .../xrefs/mappers/species/aedes_aegypti.py | 39 + .../mappers/species/anopheles_gambiae.py | 42 + .../mappers/species/culex_quinquefasciatus.py | 49 + .../xrefs/mappers/species/danio_rerio.py | 30 + .../xrefs/mappers/species/drosophila.py | 44 + .../xrefs/mappers/species/eukaryota.py | 277 +++ .../xrefs/mappers/species/homo_sapiens.py | 29 + .../mappers/species/ixodes_scapularis.py | 42 + .../xrefs/mappers/species/mus_musculus.py | 29 + .../mappers/species/neurospora_crassa.py | 33 + .../xrefs/mappers/species/parasite.py | 46 + .../mappers/species/rattus_norvegicus.py | 29 + .../species/saccharomyces_cerevisiae.py | 41 + 
.../xrefs/mappers/species/sars_cov_2.py | 131 ++ .../species/schizosaccharomyces_pombe.py | 41 + .../xrefs/mappers/species/sus_scrofa.py | 29 + .../xrefs/mappers/species/wormbase.py | 124 ++ .../xrefs/parsers/ArrayExpressParser.py | 161 ++ .../production/xrefs/parsers/BaseParser.py | 972 +++++++++ .../production/xrefs/parsers/CCDSParser.py | 101 + .../production/xrefs/parsers/DBASSParser.py | 114 ++ .../xrefs/parsers/EntrezGeneParser.py | 120 ++ .../production/xrefs/parsers/HGNCParser.py | 421 ++++ .../production/xrefs/parsers/HPAParser.py | 74 + .../xrefs/parsers/JGI_ProteinParser.py | 60 + .../production/xrefs/parsers/MGIParser.py | 72 + .../xrefs/parsers/MGI_CCDS_Parser.py | 107 + .../xrefs/parsers/MGI_Desc_Parser.py | 101 + .../production/xrefs/parsers/MIMParser.py | 159 ++ .../xrefs/parsers/Mim2GeneParser.py | 170 ++ .../production/xrefs/parsers/RFAMParser.py | 193 ++ .../production/xrefs/parsers/RGDParser.py | 154 ++ .../xrefs/parsers/ReactomeParser.py | 189 ++ .../xrefs/parsers/RefSeqCoordinateParser.py | 96 + .../xrefs/parsers/RefSeqGPFFParser.py | 341 ++++ .../production/xrefs/parsers/UCSCParser.py | 136 ++ .../production/xrefs/parsers/UniProtParser.py | 452 +++++ .../production/xrefs/parsers/VGNCParser.py | 93 + .../xrefs/parsers/XenopusJamboreeParser.py | 76 + .../xrefs/parsers/ZFINDescParser.py | 62 + .../production/xrefs/parsers/ZFINParser.py | 169 ++ .../production/xrefs/parsers/__init__.py | 15 + .../production/xrefs/parsers/miRBaseParser.py | 113 ++ 98 files changed, 16237 insertions(+), 1533 deletions(-) create mode 100644 nextflow/workflows/xrefProcess.nf create mode 100644 scripts/xrefs/coordinate_mapper.pl create mode 100644 scripts/xrefs/dump_ensembl.pl create mode 100644 scripts/xrefs/refseq_coordinate_parser.pl create mode 100644 src/python/ensembl/common/__init__.py create mode 100644 src/python/ensembl/production/xrefs/AdvisoryXrefReport.py create mode 100644 src/python/ensembl/production/xrefs/Alignment.py create mode 100644 src/python/ensembl/production/xrefs/CoordinateMapping.py create mode 100644 src/python/ensembl/production/xrefs/DirectXrefs.py create mode 100644 src/python/ensembl/production/xrefs/DumpEnsembl.py create mode 100644 src/python/ensembl/production/xrefs/DumpXref.py create mode 100644 src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py create mode 100644 src/python/ensembl/production/xrefs/Mapping.py create mode 100644 src/python/ensembl/production/xrefs/ParseSource.py create mode 100644 src/python/ensembl/production/xrefs/ProcessAlignment.py create mode 100644 src/python/ensembl/production/xrefs/RNACentralMapping.py create mode 100644 src/python/ensembl/production/xrefs/ScheduleAlignment.py create mode 100644 src/python/ensembl/production/xrefs/ScheduleMapping.py create mode 100644 src/python/ensembl/production/xrefs/ScheduleParse.py create mode 100644 src/python/ensembl/production/xrefs/ScheduleSpecies.py create mode 100644 src/python/ensembl/production/xrefs/UniParcMapping.py create mode 100644 src/python/ensembl/production/xrefs/__init__.py create mode 100644 src/python/ensembl/production/xrefs/mappers/BasicMapper.py create mode 100644 src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py create mode 100644 src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py create mode 100644 src/python/ensembl/production/xrefs/mappers/CoreInfo.py create mode 100644 src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py create mode 100644 src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py create mode 100644 
src/python/ensembl/production/xrefs/mappers/OfficialNaming.py create mode 100644 src/python/ensembl/production/xrefs/mappers/ProcessMappings.py create mode 100644 src/python/ensembl/production/xrefs/mappers/ProcessMoves.py create mode 100644 src/python/ensembl/production/xrefs/mappers/ProcessPaired.py create mode 100644 src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py create mode 100644 src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py create mode 100644 src/python/ensembl/production/xrefs/mappers/TestMappings.py create mode 100644 src/python/ensembl/production/xrefs/mappers/UniParcMapper.py create mode 100644 src/python/ensembl/production/xrefs/mappers/XrefLoader.py create mode 100644 src/python/ensembl/production/xrefs/mappers/__init__.py create mode 100644 src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py create mode 100644 src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py create mode 100644 src/python/ensembl/production/xrefs/mappers/methods/__init__.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/__init__.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/drosophila.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/eukaryota.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/parasite.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py create mode 100644 src/python/ensembl/production/xrefs/mappers/species/wormbase.py create mode 100644 src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/BaseParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/CCDSParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/DBASSParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/HGNCParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/HPAParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/MGIParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py create mode 100644 
src/python/ensembl/production/xrefs/parsers/MIMParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/RFAMParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/RGDParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/ReactomeParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/UCSCParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/UniProtParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/VGNCParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/ZFINParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/__init__.py create mode 100644 src/python/ensembl/production/xrefs/parsers/miRBaseParser.py diff --git a/nextflow/config/xref.config b/nextflow/config/xref.config index 024f80e68..a7cef685e 100644 --- a/nextflow/config/xref.config +++ b/nextflow/config/xref.config @@ -36,50 +36,40 @@ report { } profiles { + slurm { + process { + errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'finish' } + executor = 'slurm' + queue = 'production' + queueSize = 300 + maxRetries = 2 + time = '1d' + memory = 100.MB - lsf { - process { - errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'finish' } - executor = 'lsf' - queue = 'production' - queueSize = 100 - maxRetries = 3 - withLabel:small_process { - memory = 200.MB - //very specific to lsf - executor.perTaskReserve = 200.MB - } - withLabel: dm { - queue = 'datamover' - time = '2h' - } - } - } + withLabel:small_process { + memory = 200.MB + } - slurm { - process { - errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'finish' } - executor = 'slurm' - queue = 'production' - queueSize = 100 - maxRetries = 3 - time = '1d' + withLabel: dm { + queue = 'datamover' + memory = 2.GB + } - withLabel:small_process { - memory = 200.MB - } + withLabel:mem1GB { + memory = 1.GB + } - withLabel: dm { - queue = 'datamover' - time = '3h' - memory = 2.GB - } - withLabel:mem4GB { - time = '5d' - memory = 4.GB + withLabel:mem4GB { + memory = 4.GB + } + + withLabel:align_mem { + errorStrategy = 'retry' + maxRetries = 5 + memory = { task.attempt <= 5 ? 4.GB * (task.attempt * task.attempt) : 16.GB } + } + } } - } - } } diff --git a/nextflow/workflows/xrefDownload.nf b/nextflow/workflows/xrefDownload.nf index 65e255fda..e87458735 100644 --- a/nextflow/workflows/xrefDownload.nf +++ b/nextflow/workflows/xrefDownload.nf @@ -23,221 +23,221 @@ println """\ .stripIndent() def helpMessage() { - log.info""" - Usage: - nextflow run ensembl-production/xrefDownload.nf - --source_db_url (mandatory) Database URL to store information about xref sources. - Syntax: 'mysql://user:password@host:port/dbname' + log.info""" + Usage: + nextflow run ensembl-production/xrefDownload.nf + --source_db_url (mandatory) Database URL to store information about xref sources. + Syntax: 'mysql://user:password@host:port/dbname' - --base_path (mandatory) Path where log and source files will be stored, - a scratch space with sufficient storage is recommended. 
+ --base_path (mandatory) Path where log and source files will be stored, + a scratch space with sufficient storage is recommended. - --reuse_db (optional) If set to 1, an existing source database (specified in --source_db_url) will be reused. - Default: 0 + --reuse_db (optional) If set to 1, an existing source database (specified in --source_db_url) will be reused. + Default: 0 - --skip_download (optional) If set to 1, source files will only be downloaded if they don't already exist in --base_path. - Default: 0 + --skip_download (optional) If set to 1, source files will only be downloaded if they don't already exist in --base_path. + Default: 0 - --skip_preparse (optional) If set to 1, the pre-parse step will be skipped (no central DB). - Default: 1 + --skip_preparse (optional) If set to 1, the pre-parse step will be skipped (no central DB). + Default: 1 - --clean_files (optional) If set to 1, the Cleanup analysis will be run for RefSeq and UniProt files. - Default: 1 + --clean_files (optional) If set to 1, the Cleanup analysis will be run for RefSeq and UniProt files. + Default: 1 - --split_files_by_species (optional) If set to 1, UniProt and RefSeq file will be split according to taxonomy ID. - Default: 1 + --split_files_by_species (optional) If set to 1, UniProt and RefSeq file will be split according to taxonomy ID. + Default: 1 - --config_file (optional) Path to the json file containing information about xref sources to download. - Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_all_sources.json + --config_file (optional) Path to the json file containing information about xref sources to download. + Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_all_sources.json - --sources_config_file (optional) Path to the ini file containing information about all xref sources and species/divisions. - Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_config.ini + --sources_config_file (optional) Path to the ini file containing information about all xref sources and species/divisions. + Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_config.ini - --clean_dir (optional) Path where to save the cleaned up files. - Default: [--base_path]/clean_files + --clean_dir (optional) Path where to save the cleaned up files. + Default: [--base_path]/clean_files - --tax_ids_file (optional) Path to the file containing the taxonomy IDs of the species to extract data for. - Used to update the data for the provided species. + --tax_ids_file (optional) Path to the file containing the taxonomy IDs of the species to extract data for. + Used to update the data for the provided species. - --update_mode (optional) If set to 1, pipeline is in update mode, refreshing/updating its data for new taxonomy IDs. - Only used if --tax_ids_file is set. Default: 0 - """.stripIndent() + --update_mode (optional) If set to 1, pipeline is in update mode, refreshing/updating its data for new taxonomy IDs. + Only used if --tax_ids_file is set. 
Default: 0 + """.stripIndent() } workflow { - if (params.help || !params.source_db_url || !params.base_path) { - helpMessage() - - if (!params.source_db_url) { - println """ - Missing required param source_db_url - """.stripIndent() - } - if (!params.base_path) { - println """ - Missing required param base_path - """.stripIndent() + if (params.help || !params.source_db_url || !params.base_path) { + helpMessage() + + if (!params.source_db_url) { + println """ + Missing required param source_db_url + """.stripIndent() + } + if (!params.base_path) { + println """ + Missing required param base_path + """.stripIndent() + } + + exit 1 } - exit 1 - } - - ScheduleDownload() - timestamp = ScheduleDownload.out[0] + ScheduleDownload() + timestamp = ScheduleDownload.out[0] - DownloadSource(ScheduleDownload.out[1].splitText(), timestamp) + DownloadSource(ScheduleDownload.out[1].splitText(), timestamp) - CleanupTmpFiles(DownloadSource.out.collect()) - ScheduleCleanup(CleanupTmpFiles.out, timestamp) + CleanupTmpFiles(DownloadSource.out.collect()) + ScheduleCleanup(CleanupTmpFiles.out, timestamp) - Checksum(ScheduleCleanup.out[0], timestamp) - if (params.split_files_by_species) { - CleanupSplitSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) - NotifyByEmail(Checksum.out.concat(CleanupSplitSource.out.collect()).collect(), timestamp) - } else { - CleanupSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) - NotifyByEmail(Checksum.out.concat(CleanupSource.out.collect()).collect(), timestamp) - } + Checksum(ScheduleCleanup.out[0], timestamp) + if (params.split_files_by_species) { + CleanupSplitSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) + NotifyByEmail(Checksum.out.concat(CleanupSplitSource.out.collect()).collect(), timestamp) + } else { + CleanupSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) + NotifyByEmail(Checksum.out.concat(CleanupSource.out.collect()).collect(), timestamp) + } } process ScheduleDownload { - label 'small_process' + label 'small_process' - output: - val timestamp - path 'dataflow_sources.json' + output: + val timestamp + path 'dataflow_sources.json' - script: - timestamp = new java.util.Date().format("yyyyMMdd_HHmmss") + script: + timestamp = new java.util.Date().format("yyyyMMdd_HHmmss") - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleDownload --config_file ${params.config_file} --source_db_url ${params.source_db_url} --reuse_db ${params.reuse_db} --skip_preparse ${params.skip_preparse} --base_path ${params.base_path} --log_timestamp $timestamp - """ + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleDownload --config_file ${params.config_file} --source_db_url ${params.source_db_url} --reuse_db ${params.reuse_db} --skip_preparse ${params.skip_preparse} --base_path ${params.base_path} --log_timestamp $timestamp + """ } process DownloadSource { - label 'dm' - tag "$src_name" + label 'dm' + tag "$src_name" - input: - val x - val timestamp + input: + val x + val timestamp - output: - val 'DownloadSourceDone' + output: + val 'DownloadSourceDone' - shell: - src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + shell: + src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.DownloadSource --dataflow '$x' --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} - """ + 
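+    // Note: 'x' is one JSON line emitted by ScheduleDownload (dataflow_sources.json); the regex above pulls out its "name" field so the task can be tagged and logged per source.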
""" + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.DownloadSource --dataflow '$x' --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} + """ } process CleanupTmpFiles { - label 'small_process' + label 'small_process' - input: - val x + input: + val x - output: - val 'TmpCleanupDone' + output: + val 'TmpCleanupDone' - """ - find ${params.base_path} -type f -name "*.tmp" -delete - """ + """ + find ${params.base_path} -type f -name "*.tmp" -delete + """ } process ScheduleCleanup { - label 'small_process' + label 'small_process' - input: - val x - val timestamp + input: + val x + val timestamp - output: - val 'ScheduleCleanupDone' - path 'dataflow_cleanup_sources.json' + output: + val 'ScheduleCleanupDone' + path 'dataflow_cleanup_sources.json' - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleCleanup --base_path ${params.base_path} --source_db_url ${params.source_db_url} --clean_files ${params.clean_files} --clean_dir ${params.clean_dir} --split_files_by_species ${params.split_files_by_species} --log_timestamp $timestamp - """ + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleCleanup --base_path ${params.base_path} --source_db_url ${params.source_db_url} --clean_files ${params.clean_files} --clean_dir ${params.clean_dir} --split_files_by_species ${params.split_files_by_species} --log_timestamp $timestamp + """ } process Checksum { - label 'default_process' + label 'default_process' - input: - val x - val timestamp + input: + val x + val timestamp - output: - val 'ChecksumDone' + output: + val 'ChecksumDone' - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.Checksum --base_path ${params.base_path} --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} --log_timestamp $timestamp - """ + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.Checksum --base_path ${params.base_path} --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} --log_timestamp $timestamp + """ } process CleanupSplitSource { - label 'mem4GB' - tag "$src_name" - - input: - each x - val timestamp - - output: - val 'CleanupDone' - - shell: - cmd_params = "" - src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] - if (x =~ /"version_file":/) { - version_file = (x =~ /"version_file":\s*"(.*?)"/)[0][1] - cmd_params = "${cmd_params} --version_file '${version_file}'" - } - if (params.tax_ids_file) { - cmd_params = "${cmd_params} --tax_ids_file ${params.tax_ids_file}" - } - - """ - perl ${params.perl_scripts_dir}/cleanup_and_split_source.pl --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --clean_files ${params.clean_files} --update_mode ${params.update_mode} $cmd_params - """ + label 'mem4GB' + tag "$src_name" + + input: + each x + val timestamp + + output: + val 'CleanupDone' + + shell: + cmd_params = "" + src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + if (x =~ /"version_file":/) { + version_file = (x =~ /"version_file":\s*"(.*?)"/)[0][1] + cmd_params = "${cmd_params} --version_file '${version_file}'" + } + if (params.tax_ids_file) { + cmd_params = "${cmd_params} --tax_ids_file ${params.tax_ids_file}" + } + + """ + perl ${params.perl_scripts_dir}/cleanup_and_split_source.pl --base_path 
${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --clean_files ${params.clean_files} --update_mode ${params.update_mode} $cmd_params + """ } process CleanupSource { - label 'mem4GB' - tag "$src_name" - - input: - val x - val timestamp - - output: - val 'CleanupDone' - - shell: - cmd_params = "" - src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] - if (x =~ /"version_file":/) { - version_file = (x =~ /"version_file":\s*"(.*?)"/)[0][1] - cmd_params = "${cmd_params} --version_file '${version_file}'" - } - - """ - perl ${params.perl_scripts_dir}/cleanup_source.pl --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --skip_download ${params.skip_download} --clean_files ${params.clean_files} $cmd_params - """ + label 'mem4GB' + tag "$src_name" + + input: + val x + val timestamp + + output: + val 'CleanupDone' + + shell: + cmd_params = "" + src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + if (x =~ /"version_file":/) { + version_file = (x =~ /"version_file":\s*"(.*?)"/)[0][1] + cmd_params = "${cmd_params} --version_file '${version_file}'" + } + + """ + perl ${params.perl_scripts_dir}/cleanup_source.pl --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --skip_download ${params.skip_download} --clean_files ${params.clean_files} $cmd_params + """ } process NotifyByEmail { - label 'small_process' + label 'small_process' - input: - val x - val timestamp + input: + val x + val timestamp - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp - """ + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp + """ } diff --git a/nextflow/workflows/xrefProcess.nf b/nextflow/workflows/xrefProcess.nf new file mode 100644 index 000000000..02517aa60 --- /dev/null +++ b/nextflow/workflows/xrefProcess.nf @@ -0,0 +1,627 @@ +#!/usr/bin/env nextflow + +// Parameter default values +params.pipeline_name = 'Xref Process Pipeline' +params.help = false + +println """\ + XREF PROCESS PIPELINE + ====================== + release : ${params.release} + source_db_url : ${params.source_db_url} + base_path : ${params.base_path} + registry_url : ${params.registry_url} + xref_db_url : ${params.xref_db_url} + run_all : ${params.run_all} + species : ${params.species} + antispecies : ${params.antispecies} + division : ${params.division} + split_files_by_species : ${params.split_files_by_species} + sources_config_file : ${params.sources_config_file} + registry_file : ${params.registry_file} + dc_config_file : ${params.dc_config_file} + """ + .stripIndent() + +def helpMessage() { + log.info""" + Usage: + nextflow run ensembl-production/xrefProcess.nf + --release (mandatory) The Ensembl release. + + --source_db_url (mandatory) Database URL where information about xref sources is stored (created during xrefDownload pipeline). 
+ Syntax: 'mysql://user:password@host:port/dbname' + + --base_path (mandatory) Path where log and species files will be stored, + a scratch space with sufficient storage is recommended. + + --registry_url (mandatory) Database URL on which the registry metaSearch API will be run. + Syntax: 'mysql://user:password@host:port/dbname' + + --xref_db_url (mandatory) Database URL where the species intermediate DBs will be created. + Syntax: 'mysql://user:password@host:port/ + + --run_all (optional) If set to 1, the pipeline will run on ALL species in registry. + Default: 0 + + --species (optional) Comma-separated list of species to run pipeline on. + Will be disregarded if --run_all is set to 1. Takes precedence over --division. + + --antispecies (optional) Comma-separated list of species to disregard in the run. + + --division (optional) Comma-separated list of divisions to run pipeline on. + Will be disregarded if --run_all is set to 1. + + --split_files_by_species (optional) If set to 1, UniProt and RefSeq file will be split according to taxonomy ID. + Default: 1 + + --sources_config_file (optional) Path to the ini file containing information about all xref sources and species/divisions. + Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_config.ini + + --registry_file (mandatory) Path to the registry config file (used in perl scripts). + + --dc_config_file (mandatory) Path to the datachecks configuration file. + """.stripIndent() +} + +workflow { + // Check mandatory paremeters + if (params.help || !params.release || !params.source_db_url || !params.base_path || !params.registry_url || !params.xref_db_url || !params.registry_file || !params.dc_config_file) { + helpMessage() + + def required_params = [ + 'release' : params.release, + 'source_db_url' : params.source_db_url, + 'base_path' : params.base_path, + 'registry_url' : params.registry_url, + 'xref_db_url' : params.xref_db_url, + 'registry_file' : params.registry_file, + 'dc_config_file' : params.dc_config_file + ] + + required_params.each { param_name, param_value -> + if (!param_value) { + println """ + Missing required param '${param_name}' + """.stripIndent() + } + } + + exit 1 + } + + // Find the species in the registry + ScheduleSpecies() + timestamp = ScheduleSpecies.out[0] + species_info = ScheduleSpecies.out[1].splitText().map{it -> it.trim()} + + // Run the species flow for each species + species_flow(species_info, timestamp) + + // Send emails + EmailAdvisoryXrefReport(species_flow.out.collect(), timestamp) + NotifyByEmail(EmailAdvisoryXrefReport.out, timestamp) +} + +workflow species_flow { + take: + species_dataflow + timestamp + main: + // Extract the species name to create tuples + GetSpeciesName(species_dataflow) + + // Schedule primary sources to parse + ScheduleParse(GetSpeciesName.out, timestamp) + primary_sources_ch = process_output(ScheduleParse.out[0]) + schedule_secondary_ch = process_output(ScheduleParse.out[1]) + + // Parse primary sources + ParseSource(primary_sources_ch, timestamp) + + // Schedule secondary sources to parse + ScheduleSecondaryParse(schedule_secondary_ch, ParseSource.out.collect().count(), timestamp) + secondary_sources_ch = process_output(ScheduleSecondaryParse.out[0]) + schedule_tertiary_ch = process_output(ScheduleSecondaryParse.out[1]) + + // Parse secondary sources + ParseSecondarySource(secondary_sources_ch, timestamp) + + // Schedule tertiary sources to parse + ScheduleTertiaryParse(schedule_tertiary_ch, ParseSecondarySource.out.collect().count(), timestamp) + 
tertiary_sources_ch = process_output(ScheduleTertiaryParse.out[0]) + dump_enembl_ch = process_output(ScheduleTertiaryParse.out[1]) + + // Parse tertiary sources + ParseTertiarySource(tertiary_sources_ch, timestamp) + + // Dump ensembl sequences + DumpEnsembl(dump_enembl_ch, ParseTertiarySource.out.collect().count(), timestamp) + dump_xref_ch = process_output(DumpEnsembl.out[0]) + schedule_mapping_ch = process_output(DumpEnsembl.out[1]) + + // Dump xref sequences + DumpXref(dump_xref_ch, timestamp) + schedule_alignment_ch = process_output(DumpXref.out) + + // Schedule alignments + ScheduleAlignment(schedule_alignment_ch, timestamp) + alignment_ch = process_output(ScheduleAlignment.out) + + // Align dumps + Alignment(alignment_ch, timestamp) + + // Schedule mapping + ScheduleMapping(schedule_mapping_ch, Alignment.out.collect().count(), timestamp) + pre_mapping_ch = process_output(ScheduleMapping.out[0]) + mapping_ch = process_output(ScheduleMapping.out[1]) + + // Start pre-mapping steps + DirectXrefs(pre_mapping_ch, timestamp) + ProcessAlignment(DirectXrefs.out, timestamp) + + RnaCentralMapping(pre_mapping_ch, timestamp) + UniParcMapping(RnaCentralMapping.out, timestamp) + CoordinateMapping(UniParcMapping.out, timestamp) + + // Start mapping + Mapping(mapping_ch, ProcessAlignment.out.concat(CoordinateMapping.out).count(), timestamp) + + // Run datachecks + RunXrefCriticalDatacheck(Mapping.out) + RunXrefAdvisoryDatacheck(RunXrefCriticalDatacheck.out) + advisory_report_ch = process_output(RunXrefAdvisoryDatacheck.out) + + // Collect advisory datacheck outputs + AdvisoryXrefReport(advisory_report_ch, timestamp) + emit: + AdvisoryXrefReport.out +} + +def process_output(output_channel) { + return output_channel.flatMap { species_name, dataflow_file -> + def result = [] + for (line in dataflow_file.readLines()) { + result << tuple(species_name, line) + } + return result + } +} + +process ScheduleSpecies { + label 'small_process' + + output: + val timestamp + path 'dataflow_species.json' + + script: + timestamp = new java.util.Date().format("yyyyMMdd_HHmmss") + + shell: + cmd_params = "" + if (params.species) { + cmd_params = "${cmd_params} --species '${params.species}'" + } + if (params.antispecies) { + cmd_params = "${cmd_params} --antispecies '${params.antispecies}'" + } + if (params.division) { + cmd_params = "${cmd_params} --division '${params.division}'" + } + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleSpecies --registry_url ${params.registry_url} --run_all ${params.run_all} --release ${params.release} --base_path ${params.base_path} --log_timestamp $timestamp $cmd_params + """ +} + +process GetSpeciesName { + label 'small_process' + + input: + val dataflow + + output: + tuple val(species_name), val(dataflow) + + shell: + species_name = (dataflow =~ /"species_name":\s*"([A-Za-z0-9_.-]+)"/)[0][1] + + """ + """ +} + +process ScheduleParse { + label 'small_process' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + tuple val(species_name), path('dataflow_primary_sources.json') + tuple val(species_name), path('dataflow_schedule_secondary.json') + + shell: + cmd_params = "" + if (params.split_files_by_species) { + cmd_params = "${cmd_params} --get_species_file 1" + } + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 1 --sources_config_file 
${params.sources_config_file} --source_db_url ${params.source_db_url} --xref_db_url ${params.xref_db_url} --base_path ${params.base_path} --log_timestamp $timestamp $cmd_params + """ +} + +process ParseSource { + label 'mem1GB' + tag "$species_name - $source_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + val 'ParseSourceDone' + + shell: + source_name = (dataflow =~ /"source_name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ParseSource --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --base_path ${params.base_path} --perl_scripts_dir ${params.perl_scripts_dir} --log_timestamp $timestamp + """ +} + +process ScheduleSecondaryParse { + label 'small_process' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val wait + val timestamp + + output: + tuple val(species_name), path('dataflow_secondary_sources.json') + tuple val(species_name), path('dataflow_schedule_tertiary.json') + + shell: + cmd_params = "" + if (params.split_files_by_species) { + cmd_params = "${cmd_params} --get_species_file 1" + } + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 2 --source_db_url ${params.source_db_url} --base_path ${params.base_path} --log_timestamp $timestamp $cmd_params + """ +} + +process ParseSecondarySource { + label 'default_process' + tag "$species_name - $source_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + val 'ParseSecondarySourceDone' + + shell: + source_name = (dataflow =~ /"source_name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ParseSource --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --base_path ${params.base_path} --perl_scripts_dir ${params.perl_scripts_dir} --log_timestamp $timestamp + """ +} + +process ScheduleTertiaryParse { + label 'small_process' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val wait + val timestamp + + output: + tuple val(species_name), path('dataflow_tertiary_sources.json') + tuple val(species_name), path('dataflow_dump_ensembl.json') + + shell: + cmd_params = "" + if (params.split_files_by_species) { + cmd_params = "${cmd_params} --get_species_file 1" + } + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 3 --source_db_url ${params.source_db_url} --base_path ${params.base_path} --log_timestamp $timestamp $cmd_params + """ +} + +process ParseTertiarySource { + label 'mem1GB' + tag "$species_name - $source_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + val 'ParseTertiarySourceDone' + + shell: + source_name = (dataflow =~ /"source_name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ParseSource --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --base_path ${params.base_path} --perl_scripts_dir ${params.perl_scripts_dir} --log_timestamp $timestamp + """ +} + +process DumpEnsembl { + label 'default_process' + tag "$species_name" + + input: + tuple 
val(species_name), val(dataflow) + val wait + val timestamp + + output: + tuple val(species_name), path('dataflow_dump_xref.json') + tuple val(species_name), path('dataflow_schedule_mapping.json') + + script: + def retry_flag = task.attempt > 1 ? "--retry 1" : "" + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.DumpEnsembl --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --perl_scripts_dir ${params.perl_scripts_dir} $retry_flag --log_timestamp $timestamp + """ +} + +process DumpXref { + label 'mem1GB' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + tuple val(species_name), path('dataflow_schedule_alignment.json') + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.DumpXref --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --config_file ${params.config_file} --log_timestamp $timestamp + """ +} + +process ScheduleAlignment { + label 'small_process' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + tuple val(species_name), path('dataflow_alignment.json') + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleAlignment --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --log_timestamp $timestamp + """ +} + +process Alignment { + label 'align_mem' + tag "$species_name - $source_name ($source_id) - chunk $chunk" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + val 'AlignmentDone' + + shell: + source_name = (dataflow =~ /"source_name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + source_id = (dataflow =~ /"source_id":\s*([0-9]+)/)[0][1] + chunk = (dataflow =~ /"chunk":\s*([0-9]+)/)[0][1] + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.Alignment --dataflow '$dataflow' --base_path ${params.base_path} --log_timestamp $timestamp + """ +} + +process ScheduleMapping { + label 'small_process' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val wait + val timestamp + + output: + tuple val(species_name), path('dataflow_pre_mapping.json') + tuple val(species_name), path('dataflow_mapping.json') + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleMapping --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --registry_url ${params.registry_url} --log_timestamp $timestamp + """ +} + +process DirectXrefs { + label 'mem1GB' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + tuple val(species_name), val(dataflow) + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.DirectXrefs --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --registry_url ${params.registry_url} --log_timestamp $timestamp + """ +} + +process ProcessAlignment { + label 'mem1GB' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + val 'ProcessAlignmentDone' + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ProcessAlignment --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --registry_url ${params.registry_url} --log_timestamp $timestamp + """ +} + +process RnaCentralMapping { + label 'mem1GB' + tag "$species_name" + + input: + tuple 
val(species_name), val(dataflow) + val timestamp + + output: + tuple val(species_name), val(dataflow) + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.RNACentralMapping --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --registry_url ${params.registry_url} --source_db_url ${params.source_db_url} --log_timestamp $timestamp + """ +} + +process UniParcMapping { + label 'mem1GB' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + tuple val(species_name), val(dataflow) + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.UniParcMapping --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --registry_url ${params.registry_url} --source_db_url ${params.source_db_url} --log_timestamp $timestamp + """ +} + +process CoordinateMapping { + label 'mem1GB' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + val 'CoordinateMappingDone' + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.CoordinateMapping --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --registry_url ${params.registry_url} --source_db_url ${params.source_db_url} --perl_scripts_dir ${params.perl_scripts_dir} --log_timestamp $timestamp + """ +} + +process Mapping { + label 'mem4GB' + tag "$species_name" + + input: + tuple val(species_name), val(dataflow) + val wait + val timestamp + + output: + val species_name + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.Mapping --dataflow '$dataflow' --release ${params.release} --base_path ${params.base_path} --registry_url ${params.registry_url} --ignore_warnings ${params.ignore_warnings} --log_timestamp $timestamp + """ +} + +process RunXrefCriticalDatacheck { + label 'default_process' + tag "$species_name" + + input: + val species_name + + output: + val species_name + + """ + perl ${params.perl_scripts_dir}/run_process.pl -class='Nextflow::RunDataChecks' -datacheck_names='ForeignKeys' -datacheck_groups='xref_mapping' -datacheck_types='critical' -registry_file=${params.registry_file} -config_file=${params.dc_config_file} -history_file='${params.history_file}' -old_server_uri='${params.old_server_uri}' -failures_fatal=1 -species=$species_name + """ +} + +process RunXrefAdvisoryDatacheck { + label 'default_process' + tag "$species_name" + + input: + val species_name + + output: + tuple val(species_name), path('dataflow_4.json') + + """ + perl ${params.perl_scripts_dir}/run_process.pl -class='Nextflow::RunDataChecks' -datacheck_groups='xref_mapping' -datacheck_types='advisory' -registry_file=${params.registry_file} -config_file=${params.dc_config_file} -history_file='${params.history_file}' -old_server_uri='${params.old_server_uri}' -failures_fatal=0 -species=$species_name + """ +} + +process AdvisoryXrefReport { + label 'default_process' + tag "$species_name - $dc_name" + + input: + tuple val(species_name), val(dataflow) + val timestamp + + output: + val species_name + + shell: + dc_name = (dataflow =~ /"datacheck_name":\s*"([A-Za-z]+)"/)[0][1] + + script: + formatted_dataflow = dataflow.replace("'", '__') + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.AdvisoryXrefReport --dataflow '$formatted_dataflow' --release ${params.release} --base_path ${params.base_path} --species_name $species_name --log_timestamp $timestamp + """ 
+} + +process EmailAdvisoryXrefReport { + label 'default_process' + + input: + val wait + val timestamp + + output: + val 'done' + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailAdvisoryXrefReport --release ${params.release} --base_path ${params.base_path} --pipeline_name '${params.pipeline_name}' --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp + """ +} + +process NotifyByEmail { + label 'small_process' + + input: + val wait + val timestamp + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --release ${params.release} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp + """ +} \ No newline at end of file diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl index 3beabbcd6..cb92281a3 100644 --- a/scripts/xrefs/cleanup_and_split_source.pl +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -55,7 +55,7 @@ $log_file = catfile($log_path, "tmp_logfile_CleanupSplitSource_".int(rand(500))); add_to_log_file($log_file, "CleanupSplitSource starting for source $source_name"); - add_to_log_file($log_file, "Param: tax_ids_file = $tax_ids_file"); + add_to_log_file($log_file, "Param: tax_ids_file = $tax_ids_file") if $tax_ids_file; } # Do nothing if not a uniprot or refseq source @@ -288,4 +288,4 @@ sub add_to_log_file { print $fh "$current_timestamp | INFO | $message\n"; close($fh); } -} +} \ No newline at end of file diff --git a/scripts/xrefs/cleanup_source.pl b/scripts/xrefs/cleanup_source.pl index 5ce29a0f5..1226e6e1c 100644 --- a/scripts/xrefs/cleanup_source.pl +++ b/scripts/xrefs/cleanup_source.pl @@ -232,4 +232,4 @@ sub add_to_log_file { print $fh "$current_timestamp | INFO | $message\n"; close($fh); } -} +} \ No newline at end of file diff --git a/scripts/xrefs/coordinate_mapper.pl b/scripts/xrefs/coordinate_mapper.pl new file mode 100644 index 000000000..76c06775f --- /dev/null +++ b/scripts/xrefs/coordinate_mapper.pl @@ -0,0 +1,531 @@ +#!/usr/bin/env perl +# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +# Copyright [2016-2024] EMBL-European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
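+
+# coordinate_mapper.pl: matches coordinate-only xrefs (e.g. UCSC) against Ensembl
+# transcripts by exon overlap and writes xref/object_xref/unmapped_* dump files,
+# which are then loaded into the core database.
+#
+# Illustrative invocation (URLs, IDs and paths are placeholders, not real values):
+#   perl coordinate_mapper.pl \
+#     --xref_db_url mysql://user:pass@host:3306/xref_db \
+#     --core_db_url mysql://user:pass@host:3306/core_db \
+#     --species_id 9606 --analysis_id 1 --output_dir /path/to/output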
+
+use strict;
+use warnings;
+use Data::Dumper;
+use Carp;
+use DBI qw(:sql_types);
+use JSON;
+use Getopt::Long;
+use IO::File;
+use File::Spec::Functions qw(catfile);
+
+use Nextflow::Utils;
+use Bio::EnsEMBL::DBSQL::DBAdaptor;
+use Bio::EnsEMBL::Mapper::RangeRegistry;
+
+my ($xref_db_url, $core_db_url, $species_id, $output_dir, $analysis_id);
+GetOptions(
+  'xref_db_url=s' => \$xref_db_url,
+  'core_db_url=s' => \$core_db_url,
+  'species_id=i'  => \$species_id,
+  'output_dir=s'  => \$output_dir,
+  'analysis_id=i' => \$analysis_id
+);
+
+# Check that all parameters are passed
+if (!defined($xref_db_url) || !defined($core_db_url) || !defined($species_id) || !defined($output_dir) || !defined($analysis_id)) {
+  croak "Usage: coordinate_mapper.pl --xref_db_url --core_db_url --species_id --output_dir --analysis_id ";
+}
+
+# Set the files to use
+my $xref_filename            = catfile($output_dir, 'xref_coord.txt');
+my $object_xref_filename     = catfile($output_dir, 'object_xref_coord.txt');
+my $unmapped_reason_filename = catfile($output_dir, 'unmapped_reason_coord.txt');
+my $unmapped_object_filename = catfile($output_dir, 'unmapped_object_coord.txt');
+
+# Connect to dbs
+my ($core_host, $core_port, $core_user, $core_pass, $core_dbname) = parse_url($core_db_url);
+my $core_dbi = get_dbi($core_host, $core_port, $core_user, $core_pass, $core_dbname);
+my $xref_dbi = get_dbi(parse_url($xref_db_url));
+
+# Figure out the last used IDs in the core DB
+my $xref_id            = $core_dbi->selectall_arrayref('SELECT MAX(xref_id) FROM xref')->[0][0];
+my $object_xref_id     = $core_dbi->selectall_arrayref('SELECT MAX(object_xref_id) FROM object_xref')->[0][0];
+my $unmapped_object_id = $core_dbi->selectall_arrayref('SELECT MAX(unmapped_object_id) FROM unmapped_object')->[0][0];
+my $unmapped_reason_id = $core_dbi->selectall_arrayref('SELECT MAX(unmapped_reason_id) FROM unmapped_reason')->[0][0];
+
+my (%unmapped, %mapped);
+my $external_db_id;
+
+# Read and store available Xrefs from the Xref database
+my $xref_sth = $xref_dbi->prepare("SELECT c.coord_xref_id,s.name,c.accession FROM coordinate_xref c,source s WHERE c.source_id=s.source_id AND c.species_id=?");
+$xref_sth->bind_param(1, $species_id, SQL_INTEGER);
+$xref_sth->execute();
+
+while (my $xref = $xref_sth->fetchrow_hashref()) {
+  $external_db_id ||= $core_dbi->selectall_arrayref('SELECT external_db_id FROM external_db WHERE db_name=' . $core_dbi->quote($xref->{'name'}))->[0][0];
+  $external_db_id ||= 11000; # FIXME (11000 is 'UCSC')
+
+  $unmapped{$xref->{'coord_xref_id'}} = {
+    'external_db_id' => $external_db_id,
+    'accession'      => $xref->{'accession'},
+    'reason'         => 'No overlap',
+    'reason_full'    => 'No coordinate overlap with any Ensembl transcript'
+  };
+}
+$xref_sth->finish();
+
+if (!defined($external_db_id)) {
+  die "External_db_id is undefined for species_id = $species_id\n";
+}
+
+# Start the coordinate matching
+my $core_db_adaptor = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
+  -host   => $core_host,
+  -port   => $core_port,
+  -user   => $core_user,
+  -pass   => $core_pass,
+  -dbname => $core_dbname,
+);
+
+my $slice_adaptor = $core_db_adaptor->get_SliceAdaptor();
+my @chromosomes = @{ $slice_adaptor->fetch_all('Chromosome') };
+
+my $sql = qq(
+  SELECT coord_xref_id, accession,
+         txStart, txEnd,
+         cdsStart, cdsEnd,
+         exonStarts, exonEnds
+  FROM   coordinate_xref
+  WHERE  species_id = ?
+  AND    chromosome = ? AND strand = ?
+  AND    ((txStart BETWEEN ? AND ?)    -- txStart in region
+  OR      (txEnd   BETWEEN ? AND ?)    -- txEnd in region
+  OR      (txStart <= ?
AND txEnd >= ?)) -- region is fully contained + ORDER BY accession +); + +foreach my $chromosome (@chromosomes) { + my $chr_name = $chromosome->seq_region_name(); + my @genes = @{ $chromosome->get_all_Genes( undef, undef, 1 ) }; + + while (my $gene = shift(@genes)) { + my @transcripts = @{ $gene->get_all_Transcripts() }; + my %gene_result; + + foreach my $transcript (sort { $a->start() <=> $b->start() } @transcripts) { + ################################################################ + # For each Ensembl transcript: # + # 1. Register all Ensembl exons in a RangeRegistry. # + # # + # 2. Find all transcripts in the external database that are # + # within the range of this Ensembl transcript. # + # # + # For each of those external transcripts: # + # 3. Calculate the overlap of the exons of the external # + # transcript with the Ensembl exons using the # + # overlap_size() method in the RangeRegistry. # + # # + # 4. Register the external exons in their own RangeRegistry. # + # # + # 5. Calculate the overlap of the Ensembl exons with the # + # external exons as in step 3. # + # # + # 6. Calculate the match score. # + # # + # 7. Decide whether or not to keep the match. # + ################################################################ + + my @exons = @{ $transcript->get_all_Exons() }; + my %transcript_result; + + # '$rr1' is the RangeRegistry holding Ensembl exons for one transcript at a time. + my $rr1 = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + + my $coding_transcript; + if (defined($transcript->translation())) { + $coding_transcript = 1; + } else { + $coding_transcript = 0; + } + + foreach my $exon (@exons) { + # Register each exon in the RangeRegistry. Register both the + # total length of the exon and the coding range of the exon. + $rr1->check_and_register('exon', $exon->start(), $exon->end()); + + if ($coding_transcript + && defined($exon->coding_region_start($transcript)) + && defined($exon->coding_region_end($transcript) )) + { + $rr1->check_and_register('coding', $exon->coding_region_start($transcript), $exon->coding_region_end($transcript)); + } + } + + # Get hold of all transcripts from the external database that + # overlaps with this Ensembl transcript. + + my $sth = $xref_dbi->prepare_cached($sql); + $sth->bind_param(1, $species_id, SQL_INTEGER); + $sth->bind_param(2, $chr_name, SQL_VARCHAR); + $sth->bind_param(3, $gene->strand(), SQL_INTEGER); + $sth->bind_param(4, $transcript->start(), SQL_INTEGER); + $sth->bind_param(5, $transcript->end(), SQL_INTEGER); + $sth->bind_param(6, $transcript->start(), SQL_INTEGER); + $sth->bind_param(7, $transcript->end(), SQL_INTEGER); + $sth->bind_param(8, $transcript->start(), SQL_INTEGER); + $sth->bind_param(9, $transcript->end(), SQL_INTEGER); + $sth->execute(); + + my ($coord_xref_id, $accession, $txStart, $txEnd, $cdsStart, $cdsEnd, $exonStarts, $exonEnds); + + $sth->bind_columns(\($coord_xref_id, $accession, $txStart, $txEnd, $cdsStart, $cdsEnd, $exonStarts, $exonEnds)); + + while ($sth->fetch()) { + my @exonStarts = split(/,\s*/, $exonStarts); + my @exonEnds = split(/,\s*/, $exonEnds); + my $exonCount = scalar(@exonStarts); + + # '$rr2' is the RangeRegistry holding exons from the external + # transcript, for one transcript at a time. 
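+          # Overlap in each direction is scored as a fraction of exon length: the loop below registers the external exons in $rr2 and measures how much of each overlaps the Ensembl exon/coding ranges held in $rr1; the reverse comparison against $rr2 follows.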
+ my $rr2 = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + + my $exon_match = 0; + my $coding_match = 0; + my $coding_count = 0; + + for (my $i = 0 ; $i < $exonCount ; ++$i) { + # Register the exons from the external database in the same + # was as with the Ensembl exons, and calculate the overlap + # of the external exons with the previously registered + # Ensembl exons. + + my $overlap = $rr1->overlap_size('exon', $exonStarts[$i], $exonEnds[$i]); + $exon_match += $overlap/($exonEnds[$i] - $exonStarts[$i] + 1); + $rr2->check_and_register('exon', $exonStarts[$i], $exonEnds[$i]); + + if (!defined($cdsStart) || !defined($cdsEnd)) { + # Non-coding transcript. + } else { + my $codingStart = ($exonStarts[$i] > $cdsStart ? $exonStarts[$i] : $cdsStart); + my $codingEnd = ($exonEnds[$i] < $cdsEnd ? $exonEnds[$i] : $cdsEnd); + + if ($codingStart < $codingEnd) { + my $coding_overlap = $rr1->overlap_size('coding', $codingStart, $codingEnd); + $coding_match += $coding_overlap/($codingEnd - $codingStart + 1); + $rr2->check_and_register('coding', $codingStart, $codingEnd); + + ++$coding_count; + } + } + } + + my $rexon_match = 0; + my $rcoding_match = 0; + my $rcoding_count = 0; + + foreach my $exon (@exons) { + # Calculate the overlap of the Ensembl exons with the + # external exons. + + my $overlap = $rr2->overlap_size('exon', $exon->start(), $exon->end()); + $rexon_match += $overlap/($exon->end() - $exon->start() + 1); + + if ($coding_transcript + && defined($exon->coding_region_start($transcript)) + && defined($exon->coding_region_end($transcript) )) + { + my $coding_overlap = $rr2->overlap_size('coding', $exon->coding_region_start($transcript), $exon->coding_region_end($transcript)); + + $rcoding_match += $coding_overlap/($exon->coding_region_end($transcript) - $exon->coding_region_start($transcript) + 1); + + ++$rcoding_count; + } + } + + # Calculate the match score. + my $score = ( + ($exon_match + $ens_weight*$rexon_match) + + $coding_weight*($coding_match + $ens_weight*$rcoding_match) + )/ + ( + ($exonCount + $ens_weight*scalar(@exons)) + + $coding_weight*($coding_count + $ens_weight*$rcoding_count) + ); + + if (!defined( $transcript_result{$coord_xref_id}) || $transcript_result{$coord_xref_id} < $score) { + $transcript_result{$coord_xref_id} = $score; + } + + } + $sth->finish(); + + # Apply transcript threshold and pick the best match(es) for + # this transcript. + + my $best_score; + foreach my $coord_xref_id (sort( { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result) )) { + my $score = $transcript_result{$coord_xref_id}; + + if ($score > $transcript_score_threshold) { + $best_score ||= $score; + + if (sprintf("%.3f", $score) eq sprintf("%.3f", $best_score)) { + if (exists( $unmapped{$coord_xref_id})) { + $mapped{$coord_xref_id} = $unmapped{$coord_xref_id}; + delete( $unmapped{$coord_xref_id} ); + $mapped{$coord_xref_id}{'reason'} = undef; + $mapped{$coord_xref_id}{'reason_full'} = undef; + $mapped{$coord_xref_id}{'chr_name'} = $chr_name; + } + + push(@{ $mapped{$coord_xref_id}{'mapped_to'}}, { + 'ensembl_id' => $transcript->dbID(), + 'ensembl_object_type' => 'Transcript' + }); + + # This is now a candidate Xref for the gene. 
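+          # Keep only the highest transcript-level score per xref as its gene-level candidate score.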
+ if (!defined( $gene_result{$coord_xref_id}) || $gene_result{$coord_xref_id} < $score) { + $gene_result{$coord_xref_id} = $score; + } + } elsif (exists($unmapped{$coord_xref_id})) { + $unmapped{$coord_xref_id}{'reason'} = 'Was not best match'; + $unmapped{$coord_xref_id}{'reason_full'} = sprintf("Did not top best transcript match score (%.2f)", $best_score); + if (!defined( $unmapped{$coord_xref_id}{'score'}) || $score > $unmapped{$coord_xref_id}{'score'}) { + $unmapped{$coord_xref_id}{'score'} = $score; + $unmapped{$coord_xref_id}{'ensembl_id'} = $transcript->dbID(); + } + } + } elsif (exists( $unmapped{$coord_xref_id}) && $unmapped{$coord_xref_id}{'reason'} ne 'Was not best match') { + $unmapped{$coord_xref_id}{'reason'} = 'Did not meet threshold'; + $unmapped{$coord_xref_id}{'reason_full'} = sprintf( "Match score for transcript lower than threshold (%.2f)", $transcript_score_threshold); + if (!defined( $unmapped{$coord_xref_id}{'score'}) || $score > $unmapped{$coord_xref_id}{'score'}) { + $unmapped{$coord_xref_id}{'score'} = $score; + $unmapped{$coord_xref_id}{'ensembl_id'} = $transcript->dbID(); + } + } + } + } + } +} + +# Make all dumps. Order is important. +dump_xref($xref_filename, $xref_id, \%mapped, \%unmapped); +dump_object_xref($object_xref_filename, $object_xref_id, $analysis_id, \%mapped); +dump_unmapped_reason($unmapped_reason_filename, $unmapped_reason_id, \%unmapped, $core_dbi); +dump_unmapped_object($unmapped_object_filename, $unmapped_object_id, $analysis_id, \%unmapped); + +# Upload the dumps. Order is important. +upload_data('unmapped_reason', $unmapped_reason_filename, $external_db_id, $core_dbi); +upload_data('unmapped_object', $unmapped_object_filename, $external_db_id, $core_dbi); +upload_data('object_xref', $object_xref_filename, $external_db_id, $core_dbi); +upload_data('xref', $xref_filename, $external_db_id, $core_dbi); + +sub parse_url { + my ($url) = @_; + + my $parsed_url = Nextflow::Utils::parse($url); + my $user = $parsed_url->{'user'}; + my $pass = $parsed_url->{'pass'}; + my $host = $parsed_url->{'host'}; + my $port = $parsed_url->{'port'}; + my $db = $parsed_url->{'dbname'}; + + return ($host, $port, $user, $pass, $db) +} + +sub get_dbi { + my ($host, $port, $user, $pass, $dbname) = @_; + + my $dbconn; + if (defined $dbname) { + $dbconn = sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname); + } else { + $dbconn = sprintf("dbi:mysql:host=%s;port=%s", $host, $port); + } + my $dbi = DBI->connect( $dbconn, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr ); + + return $dbi; +} + +sub dump_xref { + my ($filename, $xref_id, $mapped, $unmapped) = @_; + + my $fh = IO::File->new('>' . $filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + + foreach my $xref (values(%{$unmapped}), values(%{$mapped})) { + # Assign 'xref_id' to this Xref. + $xref->{'xref_id'} = ++$xref_id; + + my $accession = $xref->{'accession'}; + my ($version) = ($accession =~ /\.(\d+)$/); + $version ||= 0; + + my $info_text = (defined($xref->{'chr_name'}) && $xref->{'chr_name'} eq 'Y' ? "Y Chromosome" : ""); + + $fh->printf("%d\t%d\t%s\t%s\t%d\t%s\t%s\t%s\n", + $xref->{'xref_id'}, + $xref->{'external_db_id'}, + $accession, + $accession, + $version, + '\N', + 'COORDINATE_OVERLAP', + $info_text + ); + } + $fh->close(); +} + +sub dump_object_xref { + my ($filename, $object_xref_id, $analysis_id, $mapped) = @_; + + my $fh = IO::File->new('>' . 
$filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + + foreach my $xref (values(%{$mapped})) { + foreach my $object_xref (@{ $xref->{'mapped_to'} }) { + # Assign 'object_xref_id' to this Object Xref. + $object_xref->{'object_xref_id'} = ++$object_xref_id; + + $fh->printf("%d\t%d\t%s\t%d\t%s\t%s\n", + $object_xref->{'object_xref_id'}, + $object_xref->{'ensembl_id'}, + $object_xref->{'ensembl_object_type'}, + $xref->{'xref_id'}, + '\N', + $analysis_id + ); + } + } + $fh->close(); +} + +sub dump_unmapped_reason { + my ($filename, $unmapped_reason_id, $unmapped, $core_dbi) = @_; + + # Create a list of the unique reasons. + my %reasons; + + foreach my $xref (values(%{$unmapped})) { + if (!exists($reasons{$xref->{'reason_full'}})) { + $reasons{$xref->{'reason_full'}} = { + 'summary' => $xref->{'reason'}, + 'full' => $xref->{'reason_full'} + }; + } + } + + my $fh = IO::File->new('>' . $filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + + my $sth = $core_dbi->prepare('SELECT unmapped_reason_id FROM unmapped_reason WHERE full_description = ?'); + + foreach my $reason (sort({ $a->{'full'} cmp $b->{'full'} } values(%reasons))) { + # Figure out 'unmapped_reason_id' from the core database. + $sth->bind_param(1, $reason->{'full'}, SQL_VARCHAR); + $sth->execute(); + + my $id; + $sth->bind_col(1, \$id); + $sth->fetch(); + + if (defined($id)) { + $reason->{'unmapped_reason_id'} = $id; + } else { + $reason->{'unmapped_reason_id'} = ++$unmapped_reason_id; + } + + $sth->finish(); + + $fh->printf("%d\t%s\t%s\n", + $reason->{'unmapped_reason_id'}, + $reason->{'summary'}, + $reason->{'full'} + ); + + } + $fh->close(); + + # Assign reasons to the unmapped Xrefs from %reasons. + foreach my $xref (values(%{$unmapped})) { + $xref->{'reason'} = $reasons{$xref->{'reason_full'}}; + $xref->{'reason_full'} = undef; + } +} + +sub dump_unmapped_object { + my ($filename, $unmapped_object_id, $analysis_id, $unmapped) = @_; + + my $fh = IO::File->new('>' . $filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + + foreach my $xref (values(%{$unmapped})) { + # Assign 'unmapped_object_id' to this Xref. + $xref->{'unmapped_object_id'} = ++$unmapped_object_id; + + $fh->printf( + "%d\t%s\t%s\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n", + $xref->{'unmapped_object_id'}, + 'xref', + $analysis_id || '\N', # '\N' (NULL) means no analysis exists and uploading this table will fail. + $xref->{'external_db_id'}, + $xref->{'accession'}, + $xref->{'reason'}->{'unmapped_reason_id'}, + (defined($xref->{'score'}) ? sprintf("%.3f", $xref->{'score'}) : '\N'), + '\N', + $xref->{'ensembl_id'} || '\N', + (defined($xref->{'ensembl_id'}) ? 'Transcript' : '\N'), + '\N' + ); + } + $fh->close(); +} + +sub upload_data { + my ($table_name, $filename, $external_db_id, $dbi) = @_; + + if (!-r $filename) { + croak(sprintf("Can not open '%s' for reading", $filename)); + } + + my $cleanup_sql = ''; + if ($table_name eq 'unmapped_reason') { + $cleanup_sql = qq( + DELETE ur + FROM unmapped_object uo, + unmapped_reason ur + WHERE uo.external_db_id = ? + AND ur.unmapped_reason_id = uo.unmapped_reason_id + ); + } elsif ($table_name eq 'unmapped_object') { + $cleanup_sql = qq( + DELETE uo + FROM unmapped_object uo + WHERE uo.external_db_id = ? + ); + } elsif ($table_name eq 'object_xref') { + $cleanup_sql = qq( + DELETE ox + FROM xref x, + object_xref ox + WHERE x.external_db_id = ? 
+ AND ox.xref_id = x.xref_id + ); + } elsif ($table_name eq 'xref') { + $cleanup_sql = qq( + DELETE x + FROM xref x + WHERE x.external_db_id = ? + ); + } else { + croak(sprintf("Table '%s' is unknown\n", $table_name)); + } + + my $load_sql = sprintf("LOAD DATA LOCAL INFILE ? REPLACE INTO TABLE %s", $table_name); + + my $rows = $dbi->do($cleanup_sql, undef, $external_db_id) or croak($dbi->errstr()); + + $rows = $dbi->do($load_sql, undef, $filename) or croak($dbi->errstr()); + + $dbi->do("OPTIMIZE TABLE $table_name") or croak($dbi->errstr()); +} \ No newline at end of file diff --git a/scripts/xrefs/dump_ensembl.pl b/scripts/xrefs/dump_ensembl.pl new file mode 100644 index 000000000..22132195d --- /dev/null +++ b/scripts/xrefs/dump_ensembl.pl @@ -0,0 +1,86 @@ +#!/usr/bin/env perl +# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +# Copyright [2016-2024] EMBL-European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +use strict; +use warnings; +use Data::Dumper; +use IO::File; +use Getopt::Long; +use Carp; + +use Nextflow::Utils; +use Bio::EnsEMBL::Registry; +use Bio::EnsEMBL::Utils::IO::FASTASerializer; + +my ($cdna_path, $pep_path, $species, $core_db_url, $release); +GetOptions( + 'cdna_path=s' => \$cdna_path, + 'pep_path=s' => \$pep_path, + 'species=s' => \$species, + 'core_db_url=s' => \$core_db_url, + 'release=s' => \$release +); + +# Check that all parameters are passed +if (!defined($cdna_path) || !defined($pep_path) || !defined($species) || !defined($core_db_url) || !defined($release)) { + croak "Usage: dump_ensembl.pl --cdna_path --pep_path --species --core_db_url --release "; +} + +# Open fasta files for writing +my $cdna_fh = IO::File->new($cdna_path ,'w') || croak("Cannot create filehandle $cdna_path"); + my $cdna_writer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($cdna_fh); +my $pep_fh = IO::File->new($pep_path ,'w') || croak("Cannot create filehandle $pep_path"); +my $pep_writer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($pep_fh); + +# Load the registry +my ($user, $pass, $host, $port, $dbname) = parse_url($core_db_url); +my $registry = 'Bio::EnsEMBL::Registry'; +my %registry_params = (-HOST => $host, -PORT => $port, -USER => $user, -DB_VERSION => $release); +$registry_params{-PASS} = $pass if ($pass); +$registry->load_registry_from_db(%registry_params); + +# Get transcripts +my $transcript_adaptor = $registry->get_adaptor($species, 'Core', 'Transcript'); +my $transcript_list = $transcript_adaptor->fetch_all(); + +# Dump sequence data +while (my $transcript = shift @$transcript_list) { + my $sequence = $transcript->seq(); + $sequence->id($transcript->dbID()); + $cdna_writer->print_Seq($sequence); + + # Get and dump translation data + my $translation = $transcript->translation; + if ($translation) { + $sequence = $transcript->translate; + $sequence->id($translation->dbID()); + $pep_writer->print_Seq($sequence); + } +} + +# Close file handles +$cdna_fh->close; +$pep_fh->close; + +sub parse_url { + my 
($url) = @_; + my $parsed_url = Nextflow::Utils::parse($url); + my $user = $parsed_url->{'user'}; + my $pass = $parsed_url->{'pass'}; + my $host = $parsed_url->{'host'}; + my $port = $parsed_url->{'port'}; + my $db = $parsed_url->{'dbname'}; + return ($user, $pass, $host, $port, $db); +} \ No newline at end of file diff --git a/scripts/xrefs/refseq_coordinate_parser.pl b/scripts/xrefs/refseq_coordinate_parser.pl new file mode 100644 index 000000000..808284ee4 --- /dev/null +++ b/scripts/xrefs/refseq_coordinate_parser.pl @@ -0,0 +1,468 @@ +#!/usr/bin/env perl +# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +# Copyright [2016-2024] EMBL-European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +use strict; +use warnings; +use Data::Dumper; +use Carp; +use DBI; +use JSON; +use Getopt::Long; + +use Nextflow::Utils; +use Bio::EnsEMBL::Registry; +use Bio::EnsEMBL::Mapper::RangeRegistry; + +my ($xref_db_url, $core_db_url, $otherf_db_url, $source_ids_json, $species_id, $species_name, $release); +GetOptions( + 'xref_db_url=s' => \$xref_db_url, + 'core_db_url=s' => \$core_db_url, + 'otherf_db_url=s' => \$otherf_db_url, + 'source_ids=s' => \$source_ids_json, + 'species_id=i' => \$species_id, + 'species_name=s' => \$species_name, + 'release=i' => \$release +); + +# Check that all parameters are passed +if (!defined($xref_db_url) || !defined($core_db_url) || !defined($otherf_db_url) || !defined($source_ids_json) || !defined($species_id) || !defined($species_name) || !defined($release)) { + croak "Usage: refseq_coordinate_parser.pl --xref_db_url --core_db_url --otherf_db_url --source_ids --species_id --species_name --release "; +} + +my $transcript_score_threshold = 0.75; +my $tl_transcript_score_threshold = 0.75; + +# Extract the source ids +my $source_ids = decode_json($source_ids_json); + +# Connect to the xref db +my ($user, $pass, $host, $port, $xref_db) = parse_url($xref_db_url); +my $dbi = get_dbi($host, $port, $user, $pass, $xref_db); + +# Load the registry +my $registry = 'Bio::EnsEMBL::Registry'; +my ($core_user, $core_pass, $core_host, $core_port, $core_dbname) = parse_url($core_db_url); +my ($otherf_user, $otherf_pass, $otherf_host, $otherf_port, $otherf_dbname) = parse_url($otherf_db_url); +$registry->load_registry_from_multiple_dbs( + { + -host => $core_host, + -port => $core_port, + -user => $core_user, + -pass => $core_pass || '', + -fb_version => $release + }, + { + -host => $otherf_host, + -port => $otherf_port, + -user => $otherf_user, + -pass => $otherf_pass || '', + -fb_version => $release + }, +); + +# Get the EntrezGene and WikiGene accessions +my (%entrez_ids) = %{ get_valid_codes("EntrezGene", $species_id, $dbi) }; +my (%wiki_ids) = %{ get_valid_codes('WikiGene', $species_id, $dbi) }; + +# Prepare link sql +my $add_dependent_xref_sth = $dbi->prepare("INSERT INTO dependent_xref (master_xref_id, dependent_xref_id, linkage_source_id) VALUES (?,?,?)"); + +# Get the db adaptors +my $otherf_dba = 
$registry->get_DBAdaptor($species_name, 'otherfeatures'); +my $core_dba = $otherf_dba->dnadb(); + +# Get the slice adaptors +my $otherf_sa = $otherf_dba->get_SliceAdaptor(); +my $core_sa = $core_dba->get_SliceAdaptor(); + +# Fetch analysis object for refseq +my $logic_name; +my $otherf_aa = $otherf_dba->get_AnalysisAdaptor(); +foreach my $analysis_adaptor (@{ $otherf_aa->fetch_all() }) { + if ($analysis_adaptor->logic_name =~ /refseq_import/) { + $logic_name = $analysis_adaptor->logic_name; + } +} + +# Not all species have refseq_import data, skip if not found +if (!defined $logic_name) { + print STDERR "No data found for RefSeq_import, skipping import\n"; + exit; +} + +# Get otherfeatures chromosomes +my $otherf_chromosomes = $otherf_sa->fetch_all('toplevel', undef, 1); +foreach my $otherf_chromosome (@$otherf_chromosomes) { + my $chr_name = $otherf_chromosome->seq_region_name(); + + # Get otherfeatures genes + my $otherf_genes = $otherf_chromosome->get_all_Genes($logic_name, undef, 1); + while (my $otherf_gene = shift @$otherf_genes) { + # Get otherfeatures transcripts + my $otherf_transcripts = $otherf_gene->get_all_Transcripts(); + foreach my $otherf_transcript (sort { $a->start() <=> $b->start() } @$otherf_transcripts) { + # Get the RefSeq accession (either the display xref or the stable ID) + my $refseq_acc; + if (defined $otherf_transcript->display_xref) { + $refseq_acc = $otherf_transcript->display_xref->display_id; + } elsif (defined $otherf_transcript->stable_id) { + $refseq_acc = $otherf_transcript->stable_id; + } else { + # Skip non-conventional accessions + next; + } + next if (!defined($refseq_acc) || $refseq_acc !~ /^[NXMR]{2}_[0-9]+/); + + my (%transcript_result, %tl_transcript_result); + my ($start, $end, $overlap); + + # Get otherfeatures exons + my $otherf_exons = $otherf_transcript->get_all_Exons(); + my $otherf_tl_exons = $otherf_transcript->get_all_translateable_Exons(); + + # Create a range registry for all the exons of the refseq transcript + my $rr1 = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + my $rr3 = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + + foreach my $otherf_exon (@$otherf_exons) { + $start = $otherf_exon->seq_region_start(); + $end = $otherf_exon->seq_region_end(); + $rr1->check_and_register('exon', $start, $end); + } + + foreach my $otherf_tl_exon (@$otherf_tl_exons) { + $start = $otherf_tl_exon->seq_region_start(); + $end = $otherf_tl_exon->seq_region_end(); + $rr3->check_and_register('exon', $start, $end); + } + + # Fetch slice in core database which overlaps refseq transcript + my $core_chromosome = $core_sa->fetch_by_region('toplevel', $chr_name, $otherf_transcript->seq_region_start, $otherf_transcript->seq_region_end); + + # Get core transcripts + my $core_transcripts = $core_chromosome->get_all_Transcripts(1); + foreach my $core_transcript (@$core_transcripts) { + next if ($core_transcript->strand != $otherf_transcript->strand); + + # Get core exons + my $core_exons = $core_transcript->get_all_Exons(); + my $core_tl_exons = $core_transcript->get_all_translateable_Exons(); + + # Create a range registry for all the exons of the ensembl transcript + my $rr2 = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + my $rr4 = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + + my ($core_exon_match, $core_tl_exon_match, $otherf_exon_match, $otherf_tl_exon_match) = (0, 0, 0, 0); + + foreach my $core_exon (@$core_exons) { + $start = $core_exon->seq_region_start(); + $end = $core_exon->seq_region_end(); + $overlap = $rr1->overlap_size('exon', $start, $end); + 
$core_exon_match += $overlap/($end - $start + 1); + $rr2->check_and_register('exon', $start, $end); + } + + foreach my $core_tl_exon (@$core_tl_exons) { + $start = $core_tl_exon->seq_region_start(); + $end = $core_tl_exon->seq_region_end(); + $overlap = $rr3->overlap_size('exon', $start, $end); + $core_tl_exon_match += $overlap/($end - $start + 1); + $rr4->check_and_register('exon', $start, $end); + } + + # Look for overlap between the two sets of exons + foreach my $otherf_exon (@$otherf_exons) { + $start = $otherf_exon->seq_region_start(); + $end = $otherf_exon->seq_region_end(); + $overlap = $rr2->overlap_size('exon', $start, $end); + $otherf_exon_match += $overlap/($end - $start + 1); + } + + foreach my $otherf_tl_exon (@$otherf_tl_exons) { + $start = $otherf_tl_exon->seq_region_start(); + $end = $otherf_tl_exon->seq_region_end(); + $overlap = $rr4->overlap_size('exon', $start, $end); + $otherf_tl_exon_match += $overlap/($end - $start + 1); + } + + # Compare exon matching with number of exons to give a score + my $score = ( ($otherf_exon_match + $core_exon_match)) / (scalar(@$otherf_exons) + scalar(@$core_exons) ); + my $tl_score = 0; + if (scalar(@$otherf_tl_exons) > 0) { + $tl_score = ( ($otherf_tl_exon_match + $core_tl_exon_match)) / (scalar(@$otherf_tl_exons) + scalar(@$core_tl_exons) ); + } + if ($core_transcript->biotype eq $otherf_transcript->biotype) { + $transcript_result{$core_transcript->stable_id} = $score; + $tl_transcript_result{$core_transcript->stable_id} = $tl_score; + } else { + $transcript_result{$core_transcript->stable_id} = $score * 0.90; + $tl_transcript_result{$core_transcript->stable_id} = $tl_score * 0.90; + } + } + + my ($best_score, $best_tl_score) = (0, 0); + my ($best_id, $score, $tl_score); + + # Compare the scores based on coding exon overlap + # If there is a stalemate, choose the best exon overlap score + foreach my $tid (sort { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result)) { + $score = $transcript_result{$tid}; + $tl_score = $tl_transcript_result{$tid}; + + if ($score > $transcript_score_threshold || $tl_score > $tl_transcript_score_threshold) { + if ($tl_score >= $best_tl_score) { + if ($tl_score > $best_tl_score) { + $best_id = $tid; + $best_score = $score; + $best_tl_score = $tl_score; + } elsif ($tl_score == $best_tl_score) { + if ($score > $best_score) { + $best_id = $tid; + $best_score = $score; + } + } + } + if (!defined $best_id) { + if ($score >= $best_score) { + $best_id = $tid; + $best_score = $score; + } + } + } + } + + # If a best match was defined for the refseq transcript, store it as direct xref for ensembl transcript + if ($best_id) { + my ($acc, $version) = split(/\./, $refseq_acc); + $version =~ s/\D//g if $version; + + # Set the appropriate source ID + my $source_id; + $source_id = $source_ids->{'mrna'} if $acc =~ /^NM_/; + $source_id = $source_ids->{'ncrna'} if $acc =~ /^NR_/; + $source_id = $source_ids->{'mrna_predicted'} if $acc =~ /^XM_/; + $source_id = $source_ids->{'ncrna_predicted'} if $acc =~ /^XR_/; + next if (!defined($source_id)); + + my $xref_id = add_xref({ + acc => $acc, + version => $version, + label => $refseq_acc, + desc => undef, + source_id => $source_id, + species_id => $species_id, + dbi => $dbi, + info_type => 'DIRECT' + }); + add_direct_xref($xref_id, $best_id, "Transcript", "", $dbi); + + my $otherf_gene = $otherf_transcript->get_Gene(); + my $entrez_id = $otherf_gene->stable_id(); + my $otherf_translation = $otherf_transcript->translation(); + my $core_ta = 
$core_dba->get_TranscriptAdaptor(); + my $transcript = $core_ta->fetch_by_stable_id($best_id); + my $translation = $transcript->translation(); + + # Add link between Ensembl gene and EntrezGene (and WikiGene) + if (defined $entrez_ids{$entrez_id} ) { + foreach my $dependent_xref_id (@{$entrez_ids{$entrez_id}}) { + $add_dependent_xref_sth->execute($xref_id, $dependent_xref_id, $source_ids->{'entrezgene'}); + } + foreach my $dependent_xref_id (@{$wiki_ids{$entrez_id}}) { + $add_dependent_xref_sth->execute($xref_id, $dependent_xref_id, $source_ids->{'wikigene'}); + } + } + + # Also store refseq protein as direct xref for ensembl translation, if translation exists + if (defined $translation && defined $otherf_translation && ($otherf_translation->seq eq $translation->seq)) { + my $translation_id = $otherf_translation->stable_id(); + my @xrefs = grep {$_->{dbname} eq 'GenBank'} @{$otherf_translation->get_all_DBEntries}; + if (scalar @xrefs == 1) { + $translation_id = $xrefs[0]->primary_id(); + } + + ($acc, $version) = split(/\./, $translation_id); + + $source_id = $source_ids->{'peptide'}; + $source_id = $source_ids->{'peptide_predicted'} if $acc =~ /^XP_/; + my $tl_xref_id = add_xref({ + acc => $acc, + version => $version, + label => $translation_id, + desc => undef, + source_id => $source_id, + species_id => $species_id, + dbi => $dbi, + info_type => 'DIRECT' + }); + add_direct_xref($tl_xref_id, $translation->stable_id(), "Translation", "", $dbi); + } + } + } + } +} + +sub parse_url { + my ($url) = @_; + + my $parsed_url = Nextflow::Utils::parse($url); + my $user = $parsed_url->{'user'}; + my $pass = $parsed_url->{'pass'}; + my $host = $parsed_url->{'host'}; + my $port = $parsed_url->{'port'}; + my $db = $parsed_url->{'dbname'}; + + return ($user, $pass, $host, $port, $db); +} + +sub get_dbi { + my ($host, $port, $user, $pass, $dbname) = @_; + + my $dbconn; + if (defined $dbname) { + $dbconn = sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname); + } else { + $dbconn = sprintf("dbi:mysql:host=%s;port=%s", $host, $port); + } + my $dbi = DBI->connect( $dbconn, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr ); + + return $dbi; +} + +sub get_valid_codes{ + my ($source_name, $species_id, $dbi) = @_; + + my %valid_codes; + my @sources; + + my $big_name = uc $source_name; + my $sql = "select source_id from source where upper(name) like '%$big_name%'"; + my $sth = $dbi->prepare($sql); + $sth->execute(); + while(my @row = $sth->fetchrow_array()){ + push @sources,$row[0]; + } + $sth->finish; + + foreach my $source (@sources){ + $sql = "select accession, xref_id from xref where species_id = $species_id and source_id = $source"; + $sth = $dbi->prepare($sql); + $sth->execute(); + while(my @row = $sth->fetchrow_array()){ + push @{$valid_codes{$row[0]}}, $row[1]; + } + } + $sth->finish(); + + return \%valid_codes; +} + +sub add_xref { + my ($arg_ref) = @_; + + my $acc = $arg_ref->{acc} || croak 'add_xref needs aa acc'; + my $source_id = $arg_ref->{source_id} || croak 'add_xref needs a source_id'; + my $species_id = $arg_ref->{species_id} || croak 'add_xref needs a species_id'; + my $label = $arg_ref->{label} // $acc; + my $description = $arg_ref->{desc}; + my $version = $arg_ref->{version} // 0; + my $info_type = $arg_ref->{info_type} // 'MISC'; + my $info_text = $arg_ref->{info_text} // q{}; + my $dbi = $arg_ref->{dbi}; + + # See if it already exists. 
If so return the existing xref_id + my $xref_id; + my $get_xref_sth = $dbi->prepare('SELECT xref_id FROM xref WHERE accession = ? AND source_id = ? AND species_id = ?'); + $get_xref_sth->execute($acc, $source_id, $species_id) or croak( $dbi->errstr() ); + if (my @row = $get_xref_sth->fetchrow_array()) { + $xref_id = $row[0]; + } + $get_xref_sth->finish(); + + if(defined $xref_id){ + return $xref_id; + } + + my $add_xref_sth = $dbi->prepare('INSERT INTO xref (accession,version,label,description,source_id,species_id, info_type, info_text) VALUES(?,?,?,?,?,?,?,?)'); + + # If the description is more than 255 characters, chop it off + if (defined $description && ((length $description) > 255 )) { + my $truncmsg = ' /.../'; + substr $description, 255 - (length $truncmsg), length $truncmsg, $truncmsg; + } + + # Add the xref and croak if it fails + $add_xref_sth->execute($acc, $version || 0, $label, $description, $source_id, $species_id, $info_type, $info_text) + or croak("$acc\t$label\t\t$source_id\t$species_id\n"); + + $add_xref_sth->finish(); + + return $add_xref_sth->{'mysql_insertid'}; +} + +sub add_direct_xref { + my ($general_xref_id, $ensembl_stable_id, $ensembl_type, $linkage_type, $dbi) = @_; + + # Check if such a mapping exists yet + my @existing_xref_ids = get_direct_xref($ensembl_stable_id, $ensembl_type, $linkage_type, $dbi); + if (scalar grep { $_ == $general_xref_id } @existing_xref_ids) { + return; + } + + $ensembl_type = lc($ensembl_type); + my $add_direct_xref_sth = $dbi->prepare('INSERT INTO ' . $ensembl_type . '_direct_xref VALUES (?,?,?)'); + + $add_direct_xref_sth->execute($general_xref_id, $ensembl_stable_id, $linkage_type); + $add_direct_xref_sth->finish(); + + return; +} + +sub get_direct_xref{ + my ($stable_id, $type, $link, $dbi) = @_; + + $type = lc $type; + + my $sql = "SELECT general_xref_id FROM ${type}_direct_xref d WHERE ensembl_stable_id = ? AND linkage_xref"; + my @sql_params = ( $stable_id ); + if (defined $link) { + $sql .= '= ?'; + push @sql_params, $link; + } else { + $sql .= 'is null'; + } + my $direct_sth = $dbi->prepare($sql); + + $direct_sth->execute( @sql_params ) || croak( $dbi->errstr() ); + if (wantarray ()) { + # Generic behaviour + my @results; + + my $all_rows = $direct_sth->fetchall_arrayref(); + foreach my $row_ref ( @{ $all_rows } ) { + push @results, $row_ref->[0]; + } + + return @results; + } else { + # Backwards-compatible behaviour + if (my @row = $direct_sth->fetchrow_array()) { + return $row[0]; + } + } + $direct_sth->finish(); + + return; +} \ No newline at end of file diff --git a/src/python/ensembl/common/Params.py b/src/python/ensembl/common/Params.py index ef9371f99..b7a163a14 100644 --- a/src/python/ensembl/common/Params.py +++ b/src/python/ensembl/common/Params.py @@ -19,215 +19,233 @@ import json import argparse +from typing import Dict, Any + sys.tracebacklimit = 0 + class Params: - def __init__(self, params: dict=None, parse_dataflow_json: bool=True) -> None: - """ - Parameters - ---------- - params: dict, optional - The parameters to start the object with. If defined, command-line parameters won't be parsed (default is None) - parse_dataflow_json: bool, optional - Specifies whether to parse an option called 'dataflow' in the provided options (default is True) - """ - if params: - self._params = params - else: - self._params = {} - self.parse_argv_params(parse_dataflow_json) - - def parse_argv_params(self, parse_dataflow_json: bool=True): - """Parses command-line arguments and extracts them into the Params object. 
- Command-line arguments need to be passed in the format "--name value". - - Parameters - ---------- - parse_dataflow_json: bool, optional - Specifies whether to parse an option called 'dataflow' in the provided options (default is True) - """ - args = sys.argv[1:] - - # Extract param names from command line - r = re.compile(r"^--") - param_names = list(filter(r.match, args)) - - parser = argparse.ArgumentParser() - for name in param_names: - parser.add_argument(name) - - params = parser.parse_args() - for param_name in vars(params): - if param_name == 'dataflow' and parse_dataflow_json: - dataflow_params = json.loads(getattr(params, param_name)) - for name,value in dataflow_params.items(): - self.param(name, value) - else: - self.param(param_name, getattr(params, param_name)) - - def param(self, name: str, new_value=None, options: dict={}): - """ Gets or sets a parameter value. - - Parameters - ---------- - name: str - The name of the paramater - new_value: any, optional - The value to set the parameter to (default is None) - options: dict, optional - Extra options, including: - - default: The default value to use if parameter has no value (sets the parameter value to this) - - type: The type of the parameter value, used to check if value is valid - - Returns - ------- - The value of the parameter with provided name. - - Raises - ------ - AttributeError - If no parameter name was passed. - """ - if not name: - raise AttributeError('You must supply a parameter name') - - value = None - - if new_value is not None: - self._params[name] = new_value - value = new_value - else: - value = self._params.get(name) - if value is None and options.get('default') is not None: - default = options['default'] - self._params[name] = default - value = default - - if options.get('type'): - return self.check_type(name, value, options['type']) - - return value - - def param_required(self, name: str, options: dict={}): - """ Gets a parameter value, raising an error if no value is found. - - Parameters - ---------- - name: str - The name of th parameter - options: dict, optional - Extra options, including: - - default: The default value to use if parameter has no value (sets the parameter value to this) - - type: The type of the parameter value, used to check if value is valid - - Returns - ------- - The value of the parameter with provided name. - - Raises - ------ - AttributeError - If no value is found for the required paramater. - """ - value = self.param(name, None, options) - - if value is None: - raise AttributeError(f'Parameter \'{name}\' is required but has no value') - - return value - - def check_type(self, name: str, value, value_type: str): - """ Checks if the parameter value provided is valid. - For specific types, this function can change the parameter value. - - Parameters - ---------- - name: str - The name of the parameter - value: any - The value of the parameter - value_type: str - The type of the parameter value. Accepted types: - - hash, dict, or dictionary - - array or list - - int or integer - - bool or boolean - - str or string - - Returns - ------- - None if no value is found, or the new value of the parameter with provided name. - - Raises - ------ - AttributeError - If no parameter name is provided. - If parameter value is not valid. 
- """ - if not name: - raise AttributeError('You must supply a parameter name') - if value is None: - return - - value_type = value_type.lower() - error = 0 - new_value = None - - if value_type in ['hash', 'dict', 'dictionary'] and not isinstance(value, dict): - error = 1 - elif value_type in ['array', 'list'] and not isinstance(value, list): - # Try to split by commas - if re.search(",", value): - new_value = value.split(",") - else: - new_value = [value] - elif value_type in ['integer', 'int'] and not isinstance(value, int): - # Try to make it an integer - try: - new_value = int(value) - except ValueError: - error = 1 - elif value_type in ['bool', 'boolean'] and not isinstance(value, bool): - # Try to make it a boolean - if isinstance(value, int): - new_value = bool(value) - elif value in ['0', '1']: - new_value = bool(int(value)) - else: - error = 1 - elif value_type in ['str', 'string'] and not isinstance(value, str): - new_value = str(value) - - if error: - raise AttributeError(f'Parameter \'{name}\' has an invalid value \'{value}\'. Must be of type {value_type}') - - self.param(name, new_value) - return new_value - - def write_output(self, suffix: str, params: dict): - """ Appends data to the dataflow json file (passed into next pipeline process). - - Parameters - ---------- - suffix: str - The file suffix to add to the output file name (dataflow_[suffix].json) - params: dict - The data to append into the file - """ - # Remove null params - params = {k: v for k, v in params.items() if v is not None} - - with open(f'dataflow_{suffix}.json', 'a') as fh: - json.dump(params, fh) - fh.write("\n") - - def write_all_output(self, suffix: str): - """ Appends all of the parameters in the object into the dataflow json file. - This calls the write_output function. - - Parameters - ---------- - suffix: str - The file suffix to add to the output file name (dataflow_[suffix].json) - """ - self.write_output(suffix, self._params) + def __init__(self, params: Dict[str, Any] = None, parse_dataflow_json: bool = True) -> None: + """Params constructor. + + Parameters + ---------- + params: dict, optional + The parameters to start the object with. If defined, command-line parameters won't be parsed (default is None) + parse_dataflow_json: bool, optional + Specifies whether to parse an option called 'dataflow' in the provided options (default is True) + """ + if params is None: + params = {} + + if params: + self._params = params + else: + self._params = {} + self.parse_argv_params(parse_dataflow_json) + + def parse_argv_params(self, parse_dataflow_json: bool = True) -> None: + """Parses command-line arguments and extracts them into the Params object. + Command-line arguments need to be passed in the format "--name value". 
+ + Parameters + ---------- + parse_dataflow_json: bool, optional + Specifies whether to parse an option called 'dataflow' in the provided options (default is True) + """ + args = sys.argv[1:] + + # Extract param names from command line + r = re.compile(r"^--") + param_names = list(filter(r.match, args)) + + parser = argparse.ArgumentParser() + for name in param_names: + parser.add_argument(name) + + params = parser.parse_args() + for param_name in vars(params): + if param_name == "dataflow" and parse_dataflow_json: + dataflow_params = json.loads(getattr(params, param_name)) + for name, value in dataflow_params.items(): + self.param(name, value) + else: + self.param(param_name, getattr(params, param_name)) + + def param(self, name: str, new_value: Any = None, options: Dict[str, Any] = None) -> Any: + """Gets or sets a parameter value. + + Parameters + ---------- + name: str + The name of the parameter + new_value: any, optional + The value to set the parameter to (default is None) + options: dict, optional + Extra options, including: + - default: The default value to use if parameter has no value (sets the parameter value to this) + - type: The type of the parameter value, used to check if value is valid + + Returns + ------- + The value of the parameter with provided name. + + Raises + ------ + AttributeError + If no parameter name was passed. + """ + if not name: + raise AttributeError("You must supply a parameter name") + if options is None: + options = {} + + value = None + + if new_value is not None: + self._params[name] = new_value + value = new_value + else: + value = self._params.get(name) + if value is None and options.get("default") is not None: + default = options["default"] + self._params[name] = default + value = default + + if options.get("type"): + return self.check_type(name, value, options["type"]) + + return value + + def param_required(self, name: str, options: Dict[str, Any] = None) -> Any: + """Gets a parameter value, raising an error if no value is found. + + Parameters + ---------- + name: str + The name of the parameter + options: dict, optional + Extra options, including: + - default: The default value to use if parameter has no value (sets the parameter value to this) + - type: The type of the parameter value, used to check if value is valid + + Returns + ------- + The value of the parameter with provided name. + + Raises + ------ + AttributeError + If no value is found for the required parameter. + """ + value = self.param(name, None, options) + + if value is None: + raise AttributeError(f"Parameter '{name}' is required but has no value") + + return value + + def check_type(self, name: str, value: Any, value_type: str) -> Any: + """Checks if the parameter value provided is valid. + For specific types, this function can change the parameter value. + + Parameters + ---------- + name: str + The name of the parameter + value: any + The value of the parameter + value_type: str + The type of the parameter value. Accepted types: + - hash, dict, or dictionary + - array or list + - int or integer + - bool or boolean + - str or string + + Returns + ------- + None if no value is found, or the new value of the parameter with provided name. + + Raises + ------ + AttributeError + If no parameter name is provided. + If parameter value is not valid. 
+ """ + if not name: + raise AttributeError("You must supply a parameter name") + if value is None: + return + + value_type = value_type.lower() + error, update = False, True + new_value = None + + if value_type in ["hash", "dict", "dictionary"] and not isinstance(value, dict): + error = True + elif value_type in ["array", "list"] and not isinstance(value, list): + # Try to split by commas + if re.search(",", value): + new_value = value.split(",") + else: + new_value = [value] + elif value_type in ["int", "integer"] and not isinstance(value, int): + # Try to make it an integer + try: + new_value = int(value) + except ValueError: + error = True + elif value_type in ["bool", "boolean"] and not isinstance(value, bool): + # Try to make it a boolean + if isinstance(value, int): + new_value = bool(value) + elif isinstance(value, str) and value in ["True", "False"]: + new_value = (value == "True") + elif value in ["0", "1", 0, 1]: + new_value = bool(int(value)) + else: + error = True + elif value_type in ["str", "string"] and not isinstance(value, str): + new_value = str(value) + else: + update = False + + if error: + raise AttributeError( + f"Parameter '{name}' has an invalid value '{value}'. Must be of type {value_type}" + ) + + if update: + self.param(name, new_value) + value = new_value + + return value + + def write_output(self, suffix: str, params: Dict[str, Any]) -> None: + """Appends data to the dataflow json file (passed into next pipeline process). + + Parameters + ---------- + suffix: str + The file suffix to add to the output file name (dataflow_[suffix].json) + params: dict + The data to append into the file + """ + # Remove null params + params = {k: v for k, v in params.items() if v is not None} + + with open(f"dataflow_{suffix}.json", "a") as fh: + json.dump(params, fh) + fh.write("\n") + + def write_all_output(self, suffix: str) -> None: + """Appends all of the parameters in the object into the dataflow json file. + This calls the write_output function. + + Parameters + ---------- + suffix: str + The file suffix to add to the output file name (dataflow_[suffix].json) + """ + self.write_output(suffix, self._params) diff --git a/src/python/ensembl/common/__init__.py b/src/python/ensembl/common/__init__.py new file mode 100644 index 000000000..e68076fa5 --- /dev/null +++ b/src/python/ensembl/common/__init__.py @@ -0,0 +1,15 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common modules.""" diff --git a/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py b/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py new file mode 100644 index 000000000..a869c1266 --- /dev/null +++ b/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py @@ -0,0 +1,39 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Xref module to print out advisory datachecks results (only needed now since we are still using perl datachecks).""" + +from ensembl.production.xrefs.Base import * + + +class AdvisoryXrefReport(Base): + def run(self): + base_path = self.param_required("base_path", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + datacheck_name = self.param("datacheck_name", None, {"type": "str"}) + datacheck_output = self.param("datacheck_output", None, {"type": "str"}) + + # Create or locate report file + report_file = self.get_path( + base_path, species_name, release, "dc_report", f"{datacheck_name}.log" + ) + + # Return the quotation marks into the output + datacheck_output = re.sub("__", "'", datacheck_output) + + # Write datacheck result into file + with open(report_file, "a") as fh: + fh.write(datacheck_output) + fh.write("\n") diff --git a/src/python/ensembl/production/xrefs/Alignment.py b/src/python/ensembl/production/xrefs/Alignment.py new file mode 100644 index 000000000..b8ee417a1 --- /dev/null +++ b/src/python/ensembl/production/xrefs/Alignment.py @@ -0,0 +1,91 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Alignment module to map xref sequences into ensEMBL ones.""" + +from ensembl.production.xrefs.Base import * + + +class Alignment(Base): + def run(self): + base_path = self.param_required("base_path", {"type": "str"}) + method = self.param_required("align_method", {"type": "str"}) + query_cutoff = self.param_required("query_cutoff", {"type": "int"}) + target_cutoff = self.param_required("target_cutoff", {"type": "int"}) + max_chunks = self.param_required("max_chunks", {"type": "int"}) + chunk = self.param_required("chunk", {"type": "int"}) + job_index = self.param_required("job_index", {"type": "int"}) + source = self.param_required("source_file", {"type": "str"}) + target = self.param_required("target_file", {"type": "str"}) + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + map_file = self.param_required("map_file", {"type": "str"}) + source_id = self.param_required("source_id", {"type": "int"}) + seq_type = self.param_required("seq_type", {"type": "str"}) + + # Construct Exonerate command + ryo = "xref:%qi:%ti:%ei:%ql:%tl:%qab:%qae:%tab:%tae:%C:%s\n" + exe = ( + subprocess.check_output("which exonerate", shell=True) + .decode("utf-8") + .strip() + ) + command_string = f"{exe} --showalignment FALSE --showvulgar FALSE --ryo '{ryo}' --gappedextension FALSE --model 'affine:local' {method} --subopt no --query {source} --target {target} --querychunktotal {max_chunks} --querychunkid {chunk}" + + # Get exonerate hits + output = subprocess.run(command_string, shell=True, stdout=subprocess.PIPE) + + exit_code = abs(output.returncode) + if exit_code == 0: + hits = output.stdout.decode("utf-8").split("\n") + + # Write to mapping file + map_fh = open(map_file, "w") + for hit in hits: + if re.search(r"^xref", hit): + map_fh.write(f"{hit}\n") + map_fh.close() + elif exit_code == 9: + raise MemoryError( + f"Exonerate failed due to insufficient memory (exit code: {exit_code})" + ) + elif exit_code == 256: + raise SyntaxError( + f"Exonerate failed due to unexpected character(s) in files (exit code: {exit_code})" + ) + else: + raise Exception(f"Exonerate failed with exit_code: {output.returncode}") + + # Add job and mapping data into db + db_engine = self.get_db_engine(xref_db_url) + with db_engine.connect() as xref_dbi: + out_file = f"xref_{seq_type}.{max_chunks}-{chunk}.out" + job_id = f"{source_id}{job_index}{chunk}" + xref_dbi.execute( + insert(MappingJobsORM).values( + map_file=map_file, + status="SUBMITTED", + out_file=out_file, + err_file=out_file, + array_number=chunk, + job_id=job_id, + ) + ) + xref_dbi.execute( + insert(MappingORM).values( + job_id=job_id, + method=seq_type, + percent_query_cutoff=query_cutoff, + percent_target_cutoff=target_cutoff, + ) + ) diff --git a/src/python/ensembl/production/xrefs/Base.py b/src/python/ensembl/production/xrefs/Base.py index d5022627f..3a59abfc0 100644 --- a/src/python/ensembl/production/xrefs/Base.py +++ b/src/python/ensembl/production/xrefs/Base.py @@ -29,10 +29,11 @@ import random import csv import subprocess +import unicodedata -from sqlalchemy import create_engine, select, insert, update, text, func, and_ +from sqlalchemy import create_engine, select, insert, update, text, func, and_, delete from sqlalchemy.engine.url import make_url, URL -from sqlalchemy.engine import Connection +from sqlalchemy.engine import Engine, Connection from sqlalchemy.orm import aliased from sqlalchemy_utils import database_exists, create_database, drop_database from urllib.parse import urlparse @@ -40,822 +41,973 @@ from itertools import groupby from 
configparser import ConfigParser from datetime import datetime - -from ensembl.xrefs.xref_source_db_model import Base as XrefSourceDB, Source as SourceSORM, Version as VersionORM, ChecksumXref as ChecksumXrefSORM - -from ensembl.xrefs.xref_update_db_model import Base as XrefUpdateDB, Source as SourceUORM, SourceURL as SourceURLORM, Xref as XrefUORM, \ - PrimaryXref as PrimaryXrefORM, DependentXref as DependentXrefUORM, GeneDirectXref as GeneDirectXrefORM, TranscriptDirectXref as TranscriptDirectXrefORM, \ - TranslationDirectXref as TranslationDirectXrefORM, Synonym as SynonymORM, Pairs as PairsORM, Species as SpeciesORM, \ - SourceMappingMethod as SourceMappingMethodORM, MappingJobs as MappingJobsORM, Mapping as MappingORM - -from ensembl.core.models import Meta as MetaCORM, Gene as GeneORM, Transcript as TranscriptORM, Analysis as AnalysisORM, \ - ExonTranscript as ExonTranscriptORM, SupportingFeature as SupportingFeatureORM, DnaAlignFeature as DnaAlignFeatureORM, \ - TranscriptAttrib as TranscriptAttribORM, AttribType as AttribTypeORM, AnalysisDescription as AnalysisDescriptionORM, \ - SeqRegion as SeqRegionORM, SeqRegionAttrib as SeqRegionAttribORM, CoordSystem as CoordSystemORM, Translation as TranslationORM, \ - Exon as ExonORM, Xref as XrefCORM, DependentXref as DependentXrefCORM, ExternalDb as ExternalDbORM, Dna as DnaORM, ObjectXref as ObjectXrefCORM +from pyspark import SparkConf +from pyspark.sql import SparkSession +from typing import IO, List, Dict, Any, Iterator, Optional + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper + +from ensembl.xrefs.xref_source_db_model import ( + Base as XrefSourceDB, + Source as SourceSORM, + Version as VersionORM, + ChecksumXref as ChecksumXrefSORM, +) + +from ensembl.xrefs.xref_update_db_model import ( + Base as XrefUpdateDB, + Source as SourceUORM, + SourceURL as SourceURLORM, + Xref as XrefUORM, + PrimaryXref as PrimaryXrefORM, + DependentXref as DependentXrefUORM, + CoordinateXref as CoordinateXrefORM, + GeneDirectXref as GeneDirectXrefORM, + TranscriptDirectXref as TranscriptDirectXrefORM, + TranslationDirectXref as TranslationDirectXrefORM, + Synonym as SynonymORM, + Pairs as PairsORM, + Species as SpeciesORM, + MappingJobs as MappingJobsORM, + Mapping as MappingORM, +) + +from ensembl.core.models import ( + Meta as MetaCORM, + Analysis as AnalysisORM, + AnalysisDescription as AnalysisDescriptionORM, + SeqRegion as SeqRegionORM, + CoordSystem as CoordSystemORM, + Dna as DnaORM, + Gene as GeneORM, + Transcript as TranscriptORM, + Translation as TranslationORM, + Exon as ExonORM, + ExonTranscript as ExonTranscriptORM, + SupportingFeature as SupportingFeatureORM, + DnaAlignFeature as DnaAlignFeatureORM, + AttribType as AttribTypeORM, + TranscriptAttrib as TranscriptAttribORM, + SeqRegionAttrib as SeqRegionAttribORM, + Xref as XrefCORM, + DependentXref as DependentXrefCORM, + ExternalDb as ExternalDbORM, + ObjectXref as ObjectXrefCORM, +) from ensembl.common.Params import Params + class Base(Params): - """ Class to represent the base of xref modules. Inherits the Params class. - """ - def __init__(self, params: dict=None, parse_dataflow_json: bool=True) -> None: - """ Calls the parent __init__ then sets some specific parameters. - - Parameters - ---------- - params: dict, optional - The parameters to start the object with. 
If defined, command-line parameters won't be parsed (default is None) - parse_dataflow_json: bool, optional - Specifies whether to parse an option called 'dataflow' in the provided options (default is True) - """ - super().__init__(params, parse_dataflow_json) - - self.param('metasearch_url', "http://registry-grpc.ebi.ac.uk:8080/registry/metaSearch") - - # Initialize the logfile for this run - if self.param('log_timestamp'): - current_timestamp = self.param('log_timestamp') - else: - current_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - - log_path = os.path.join(self.param_required('base_path'), 'logs', current_timestamp) - if not os.path.exists(log_path): os.makedirs(log_path, exist_ok = True) - - log_file = os.path.join(log_path, 'tmp_logfile_'+self.__class__.__name__+'_'+str(random.randint(0, 5000))) - self._log_file = log_file - - console_handler = logging.StreamHandler() - file_handler = logging.FileHandler(log_file, mode='a') - console_handler.setLevel(logging.WARNING) - file_handler.setLevel(logging.DEBUG) - - logging.basicConfig( - level=logging.DEBUG, - format='%(asctime)s | %(levelname)s | %(message)s', - datefmt='%d-%b-%Y %H:%M:%S', - handlers=[console_handler, file_handler] - ) - - def create_source_db(self, source_url: str, reuse_db_if_present: bool): - """ Creates the xref source database from model. - - Parameters - ---------- - source_url: str - The source database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname] - reuse_db_if_present: bool - If set to False, the database defined by provided URL will be dropped before creating a new one - """ - url = make_url(source_url) - engine = create_engine(url, isolation_level="AUTOCOMMIT") - - if url.database and reuse_db_if_present: - return - - if database_exists(engine.url): - drop_database(engine.url) - create_database(engine.url) - XrefSourceDB.metadata.create_all(engine) - - def download_file(self, file: str, base_path: str, source_name: str, extra_args: dict): - """ Downloads an xref file and saves into provided space. - - Parameters - ---------- - file: str - The URL of the file to download. Acceptable URL schemes: ftp, http, and https - base_path: str - The path to save the downloaded file into - source_name: str - The xref source name - extra_args: dict - Extra options, including: - - skip_download_if_file_present: If set to True, file is only downloaded if does not exist - - db: The type of external db for the xref source (only relevent here if equal to 'checksum') - - release: If set to 'version', then this is a version file download - - rel_number: The URL used to retrieve the release number (only for RefSeq) - - catalog: The URL used to retrieve the release catalog (only for RefSeq) - - Returns - ------- - The path of the downloaded file. - - Raises - ------ - LookupError - If rel_number is provided but no release number was found in URL. - AttributeError - If file URL scheme is invalid. 
- """ - # Create uri object and get scheme - uri = urlparse(file) - if not uri.scheme: - return file - - # Get extra parameters - skip_download_if_file_present = extra_args.get('skip_download_if_file_present') or False - db = extra_args.get('db') - release = extra_args.get('release') - rel_number = extra_args.get('rel_number') - catalog = extra_args.get('catalog') - - # Create file download path - orig_source_name = source_name - source_name = re.sub(r"\/", "", source_name) - dest_dir = os.path.join(base_path, source_name) - if db and db == 'checksum': - dest_dir = os.path.join(base_path, 'Checksum') - if not os.path.exists(dest_dir): os.makedirs(dest_dir, exist_ok = True) - - file_path = "" - - # If file is in local ftp, copy from there - if re.search("ftp.ebi.ac.uk", file): - # Construct local path - local_file = file - local_file = re.sub("https://ftp.ebi.ac.uk/pub/", "/nfs/ftp/public/", local_file) - - # Check if local file exists - if os.path.exists(local_file): - file_path = os.path.join(dest_dir, os.path.basename(uri.path)) - if db and db == 'checksum': - file_path = os.path.join(dest_dir, f'{source_name}-{os.path.basename(uri.path)}') - - if not (skip_download_if_file_present and os.path.exists(file_path)): - shutil.copy(local_file, file_path) - - # Check if copy was successful - if os.path.exists(file_path): - logging.info(f'{orig_source_name} file copied from local FTP: {file_path}') - if release: - return file_path + """Class to represent the base of xref modules. Inherits the Params class.""" + + def __init__(self, params: Dict[str, Any] = None, parse_dataflow_json: bool = True) -> None: + """Calls the parent __init__ then sets some specific parameters. + + Parameters + ---------- + params: dict, optional + The parameters to start the object with. If defined, command-line parameters won't be parsed (default is None) + parse_dataflow_json: bool, optional + Specifies whether to parse an option called 'dataflow' in the provided options (default is True) + """ + super().__init__(params, parse_dataflow_json) + + self.param( + "metasearch_url", "http://registry-grpc.ebi.ac.uk:8080/registry/metaSearch" + ) + + # Initialize the logfile for this run (except for the Alignment module) + module_name = self.__class__.__name__ + if module_name != "Alignment": + if self.param("log_timestamp"): + current_timestamp = self.param("log_timestamp") + else: + current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + log_path = os.path.join( + self.param_required("base_path"), "logs", current_timestamp + ) + if not os.path.exists(log_path): + os.makedirs(log_path, exist_ok=True) + + log_file = os.path.join( + log_path, + "tmp_logfile_" + module_name + "_" + str(random.randint(0, 5000)), + ) + self._log_file = log_file + + console_handler = logging.StreamHandler() + file_handler = logging.FileHandler(log_file, mode="a") + console_handler.setLevel(logging.WARNING) + file_handler.setLevel(logging.DEBUG) + + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s | %(levelname)s | %(message)s", + datefmt="%d-%b-%Y %H:%M:%S", + handlers=[console_handler, file_handler], + ) + + def create_source_db(self, source_url: str, reuse_db_if_present: bool) -> None: + """Creates the xref source database from model. 
+ + Parameters + ---------- + source_url: str + The source database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname] + reuse_db_if_present: bool + If set to False, the database defined by provided URL will be dropped before creating a new one + """ + url = make_url(source_url) + engine = create_engine(url, isolation_level="AUTOCOMMIT") + + if url.database and reuse_db_if_present: + return + + if database_exists(engine.url): + drop_database(engine.url) + create_database(engine.url) + XrefSourceDB.metadata.create_all(engine) + + def download_file(self, file: str, base_path: str, source_name: str, extra_args: Dict[str, Any]) -> str: + """Downloads an xref file and saves into provided space. + + Parameters + ---------- + file: str + The URL of the file to download. Acceptable URL schemes: ftp, http, and https + base_path: str + The path to save the downloaded file into + source_name: str + The xref source name + extra_args: dict + Extra options, including: + - skip_download_if_file_present: If set to True, file is only downloaded if does not exist + - db: The type of external db for the xref source (only relevent here if equal to 'checksum') + - release: If set to 'version', then this is a version file download + - rel_number: The URL used to retrieve the release number (only for RefSeq) + - catalog: The URL used to retrieve the release catalog (only for RefSeq) + + Returns + ------- + The path of the downloaded file. + + Raises + ------ + LookupError + If rel_number is provided but no release number was found in URL. + AttributeError + If file URL scheme is invalid. + """ + # Create uri object and get scheme + uri = urlparse(file) + if not uri.scheme: + return file + + # Get extra parameters + skip_download_if_file_present = ( + extra_args.get("skip_download_if_file_present") or False + ) + db = extra_args.get("db") + release = extra_args.get("release") + rel_number = extra_args.get("rel_number") + catalog = extra_args.get("catalog") + + # Create file download path + orig_source_name = source_name + source_name = re.sub(r"\/", "", source_name) + dest_dir = os.path.join(base_path, source_name) + if db and db == "checksum": + dest_dir = os.path.join(base_path, "Checksum") + if not os.path.exists(dest_dir): + os.makedirs(dest_dir, exist_ok=True) + + file_path = "" + + # If file is in local ftp, copy from there + if re.search("ftp.ebi.ac.uk", file): + # Construct local path + local_file = file + local_file = re.sub( + "https://ftp.ebi.ac.uk/pub/", "/nfs/ftp/public/", local_file + ) + + # Check if local file exists + if os.path.exists(local_file): + file_path = os.path.join(dest_dir, os.path.basename(uri.path)) + if db and db == "checksum": + file_path = os.path.join( + dest_dir, f"{source_name}-{os.path.basename(uri.path)}" + ) + + if not (skip_download_if_file_present and os.path.exists(file_path)): + shutil.copy(local_file, file_path) + + # Check if copy was successful + if os.path.exists(file_path): + logging.info( + f"{orig_source_name} file copied from local FTP: {file_path}" + ) + # if release: + # return file_path + # return os.path.dirname(file_path) + return file_path + else: + logging.info( + f"{orig_source_name} file already exists, skipping download ({file_path})" + ) + + # Handle Refseq files + if re.search("RefSeq", source_name) and rel_number and catalog and not release: + # Get current release number + release_number = requests.get(rel_number).json() + if not release_number: + raise LookupError(f"No release number in {rel_number}") + + # Get list of files in 
release catalog + catalog = re.sub(r"\*", str(release_number), catalog) + files_list = requests.get(catalog).text + refseq_files = files_list.split("\n") + files_to_download = [] + + # Download each refseq file + for refseq_file in refseq_files: + if not refseq_file: + continue + checksum, filename = refseq_file.split("\t") + + # Only interested in files matching pattern + if not fnmatch.fnmatch(filename, os.path.basename(uri.path)): + continue + if re.search("nonredundant_protein", filename) or re.search( + "wp_protein", filename + ): + continue + + file_path = os.path.join(dest_dir, os.path.basename(filename)) + if os.path.exists(file_path): + if skip_download_if_file_present: + logging.info( + f"{orig_source_name} file already exists, skipping download ({file_path})" + ) + continue + os.remove(file_path) + + file_url = os.path.join(os.path.dirname(file), filename) + files_to_download.append({"url": file_url, "path": file_path}) + logging.info( + f"{orig_source_name} file downloaded via HTTP: {file_path}" + ) + + self.refseq_multithreading(files_to_download) + elif uri.scheme == "ftp": + ftp = FTP(uri.netloc) + ftp.login("anonymous", "-anonymous@") + ftp.cwd(os.path.dirname(uri.path)) + remote_files = ftp.nlst() + + # Download files in ftp server + for remote_file in remote_files: + # Only interested in files matching pattern + if not fnmatch.fnmatch(remote_file, os.path.basename(uri.path)): + continue + + remote_file = re.sub(r"\n", "", remote_file) + file_path = os.path.join(dest_dir, os.path.basename(remote_file)) + if db and db == "checksum": + file_path = os.path.join( + dest_dir, f"{source_name}-{os.path.basename(remote_file)}" + ) + + if not (skip_download_if_file_present and os.path.exists(file_path)): + ftp.retrbinary("RETR " + remote_file, open(file_path, "wb").write) + logging.info( + f"{orig_source_name} file downloaded via FTP: {file_path}" + ) + else: + logging.info( + f"{orig_source_name} file already exists, skipping download ({file_path})" + ) + ftp.close() + elif uri.scheme == "http" or uri.scheme == "https": + # This is the case for the release file + if re.search("RefSeq", source_name) and rel_number and release: + # Get current release number + release_number = requests.get(rel_number).json() + if not release_number: + raise LookupError(f"No release number in {rel_number}") + + file = re.sub(r"\*", str(release_number), file) + uri = urlparse(file) + + file_path = os.path.join(dest_dir, os.path.basename(uri.path)) + if db and db == "checksum": + file_path = os.path.join( + dest_dir, f"{source_name}-{os.path.basename(uri.path)}" + ) + + if not os.path.exists(file_path) or not skip_download_if_file_present: + if not skip_download_if_file_present and os.path.exists(file_path): + os.remove(file_path) + wget.download(file, file_path) + logging.info( + f"{orig_source_name} file downloaded via HTTP: {file_path}" + ) + else: + logging.info( + f"{orig_source_name} file already exists, skipping download ({file_path})" + ) + else: + raise AttributeError(f"Invalid URL scheme {uri.scheme}") + + # if release: + # return file_path + # return os.path.dirname(file_path) + if re.search("RefSeq", source_name) and not release: return os.path.dirname(file_path) + return file_path + + def refseq_multithreading(self, files: List[str]) -> None: + """Creates multiple threads to download RefSeq files in parallel. + + Parameters + ---------- + files: list + The list of file URLs and paths to download. 
+ """ + number_of_threads = 20 + chunk_size = int(len(files) / number_of_threads) + threads = [] + + for thread_index in range(number_of_threads): + array_start = thread_index * chunk_size + array_end = ( + len(files) + if thread_index + 1 == number_of_threads + else (thread_index + 1) * chunk_size + ) + + thread = threading.Thread( + target=self.download_refseq_files, args=(files, array_start, array_end) + ) + threads.append(thread) + threads[thread_index].start() + + for thread in threads: + thread.join() + + def download_refseq_files(self, files: List[str], start: int, end: int) -> None: + """Downloads RefSeq files from a subset of files. + + Parameters + ---------- + files: list + The list of file URLs and paths to download. + start: int + The start index of the files list. + end: int + The end index of the files list. + + Raises + ------ + Exception + If file download fails all attempts. + """ + for index in range(start, end): + failed = 0 + file_url = files[index]["url"] + local_path = files[index]["path"] + + for retry in range(0, 3): + try: + wget.download(file_url, local_path) + except: + failed += 1 + continue + break + + if failed > 0: + raise BufferError(f"Failed to download file {file_url}") + + def get_dbi(self, url: str) -> Connection: + """Returns a DB connection for a provided URL. + + Parameters + ---------- + url: str + The database URL to connect to + + Returns + ------- + An sqlalchemy engine connection. + """ + connect_url = make_url(url) + engine = create_engine(connect_url, isolation_level="AUTOCOMMIT") + + return engine.connect() + + def get_db_engine(self, url: str) -> Engine: + """Returns a DB engine for a provided URL. + + Parameters + ---------- + url: str + The database URL to create an engine for + + Returns + ------- + An sqlalchemy engine. + """ + connect_url = make_url(url) + engine = create_engine(connect_url, isolation_level="AUTOCOMMIT") + + return engine + + def load_checksum(self, path: str, url: str) -> None: + """Loads the xref checksum files into a provided database. + This first combines the checksum data from different xref sources into 1 file called checksum.txt before loading into the DB. 
+ + Parameters + ---------- + path: str + The path where the checksum files can be found + url: str + The database URL to load the checksum data into + """ + checksum_dir = os.path.join(path, "Checksum") + if not os.path.exists(checksum_dir): + os.makedirs(checksum_dir, exist_ok=True) + + output_files = [] + threshold = 50000000 + counter = 1 + source_id = 1 + output_fh = None + + # Connect to db + url = url + "?local_infile=1" + db_engine = self.get_db_engine(url) + with db_engine.connect() as dbi: + # Get all checksum files + files = os.listdir(checksum_dir) + + # Go through all available checksum files + index = 0 + for checksum_file in files: + if re.search("checksum", checksum_file): + continue + + # Get the source name and ID + input_file = os.path.join(checksum_dir, checksum_file) + match = re.search(r"\/([A-Za-z]*)-.*$", input_file) + source_name = match.group(1) + source_id = self.get_source_id_from_name(dbi, source_name) + + # Open the input file + input_fh = self.get_filehandle(input_file) + for line in input_fh: + # Open the output file + if not output_fh or (counter % threshold) == 0: + if output_fh: + output_fh.close() + index += 1 + output_file = os.path.join( + checksum_dir, f"checksum_{index}.txt" + ) + output_files.append(output_file) + output_fh = open(output_file, "w") + + line = line.rstrip() + (checksum_id, checksum) = re.split(r"\s+", line) + + output = [str(counter), str(source_id), checksum_id, checksum] + output_str = "\t".join(output) + output_fh.write(f"{output_str}\n") + + counter += 1 + + input_fh.close() + + if output_fh: + output_fh.close() + + # Add the data in the files to the db + for output_file in output_files: + dbi.execute( + text( + f"load data local infile '{output_file}' into table checksum_xref" + ) + ) + + # Merge the created files + merged_file = os.path.join(checksum_dir, f"checksum.txt") + with open(merged_file, "w") as output_fh: + for output_file in output_files: + with open(output_file, "r") as input_fh: + shutil.copyfileobj(input_fh, output_fh) + os.remove(output_file) + + def get_filehandle(self, filename: str) -> IO: + """Opens an appropriate read filehandle for a file based on its type. + + Parameters + ---------- + filename: str + The name and path of the file to read + + Returns + ------- + A read filehandle. + + Raises + ------ + FileNotFoundError + If no file name was provided. + If provided file could not be found. 
+ """ + if not filename or filename == "": + raise FileNotFoundError("No file name") + + alt_filename = filename + alt_filename = re.sub(r"\.(gz|Z)$", "", alt_filename) + if alt_filename == filename: + alt_filename = alt_filename + ".gz" + + if not os.path.exists(filename): + if not os.path.exists(alt_filename): + raise FileNotFoundError( + f"Could not find either {filename} or {alt_filename}" + ) + filename = alt_filename + + if re.search(r"\.(gz|Z)$", filename): + fh = gzip.open(filename, "rt") else: - logging.info(f'{orig_source_name} file already exists, skipping download ({file_path})') - - # Handle Refseq files - if re.search("RefSeq", source_name) and rel_number and catalog and not release: - # Get current release number - release_number = requests.get(rel_number).json() - if not release_number: - raise LookupError(f'No release number in {rel_number}') - - # Get list of files in release catalog - catalog = re.sub(r"\*", str(release_number), catalog) - files_list = requests.get(catalog).text - refseq_files = files_list.split("\n") - files_to_download = [] - - # Download each refseq file - for refseq_file in refseq_files: - if not refseq_file: continue - checksum, filename = refseq_file.split("\t") - - # Only interested in files matching pattern - if not fnmatch.fnmatch(filename, os.path.basename(uri.path)): continue - if re.search("nonredundant_protein", filename) or re.search("wp_protein", filename): continue - - file_path = os.path.join(dest_dir, os.path.basename(filename)) - if os.path.exists(file_path): - if skip_download_if_file_present: - logging.info(f'{orig_source_name} file already exists, skipping download ({file_path})') - continue - os.remove(file_path) - - file_url = os.path.join(os.path.dirname(file), filename) - files_to_download.append({'url': file_url, 'path': file_path}) - logging.info(f'{orig_source_name} file downloaded via HTTP: {file_path}') - - self.refseq_multithreading(files_to_download) - elif uri.scheme == 'ftp': - ftp = FTP(uri.netloc) - ftp.login('anonymous', '-anonymous@') - ftp.cwd(os.path.dirname(uri.path)) - remote_files = ftp.nlst() - - # Download files in ftp server - for remote_file in remote_files: - # Only interested in files matching pattern - if not fnmatch.fnmatch(remote_file, os.path.basename(uri.path)): continue - - remote_file = re.sub(r"\n", "", remote_file) - file_path = os.path.join(dest_dir, os.path.basename(remote_file)) - if db and db == 'checksum': - file_path = os.path.join(dest_dir, f'{source_name}-{os.path.basename(remote_file)}') - - if not (skip_download_if_file_present and os.path.exists(file_path)): - ftp.retrbinary("RETR " + remote_file , open(file_path, 'wb').write) - logging.info(f'{orig_source_name} file downloaded via FTP: {file_path}') + fh = open(filename, "r") + + return fh + + def get_source_id_from_name(self, dbi: Connection, source_name: str) -> int: + """Retrieves a source ID from its name from a database. + + Parameters + ---------- + dbi: db connection + The database connection to query in + source_name: str + The name of the source + + Returns + ------- + The source ID. + """ + source_id = dbi.execute( + select(SourceSORM.source_id).where(SourceSORM.name == source_name) + ).scalar() + + return source_id + + def get_file_sections(self, file: str, delimiter: str) -> Iterator[List[str]]: + """Reads a provided file by sections, separated by a provided delimiter. + This function uses 'yield' to provide the file sections one by one. 
+ + Parameters + ---------- + file: str + The name and path of the file to read + delimiter: str + The character or string separating the file sections + + Returns + ------- + A yield of file sections. + """ + if re.search(r"\.(gz|Z)$", file): + with gzip.open(file, "rt") as fh: + groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) + for key, group in groups: + yield list(group) else: - logging.info(f'{orig_source_name} file already exists, skipping download ({file_path})') - ftp.close() - elif uri.scheme == 'http' or uri.scheme == 'https': - # This is the case for the release file - if re.search("RefSeq", source_name) and rel_number and release: - # Get current release number - release_number = requests.get(rel_number).json() - if not release_number: - raise LookupError(f'No release number in {rel_number}') - - file = re.sub(r"\*", str(release_number), file) - uri = urlparse(file) - - file_path = os.path.join(dest_dir, os.path.basename(uri.path)) - if db and db == 'checksum': - file_path = os.path.join(dest_dir, f'{source_name}-{os.path.basename(uri.path)}') - - if not os.path.exists(file_path) or not skip_download_if_file_present: - if not skip_download_if_file_present and os.path.exists(file_path): - os.remove(file_path) - wget.download(file, file_path) - logging.info(f'{orig_source_name} file downloaded via HTTP: {file_path}') - else: - logging.info(f'{orig_source_name} file already exists, skipping download ({file_path})') - else: - raise AttributeError(f'Invalid URL scheme {uri.scheme}') - - if release: - return file_path - return os.path.dirname(file_path) - - def refseq_multithreading(self, files): - """ Creates multiple threads to download RefSeq files in parallel. - - Parameters - ---------- - files: list - The list of file URLs and paths to download. - """ - number_of_threads = 20 - chunk_size = int(len(files) / number_of_threads) - threads = [] - - for thread_index in range(number_of_threads): - array_start = thread_index * chunk_size - array_end = len(files) if thread_index+1 == number_of_threads else (thread_index+1) * chunk_size - - thread = threading.Thread(target=self.download_refseq_files, args=(files, array_start, array_end)) - threads.append(thread) - threads[thread_index].start() - - for thread in threads: - thread.join() - - def download_refseq_files(self, files, start: int, end: int): - """ Downloads RefSeq files from a subset of files. - - Parameters - ---------- - files: list - The list of file URLs and paths to download. - start: int - The start index of the files list. - end: int - The end index of the files list. - - Raises - ------ - Exception - If file download fails all attempts. - """ - for index in range(start, end): - failed = 0 - file_url = files[index]['url'] - local_path = files[index]['path'] - - for retry in range(0,3): - try: - wget.download(file_url, local_path) - except: - failed += 1 - continue - break - - if failed > 0: - raise Exception(f'Failed to download file {file_url}') - - def get_dbi(self, url: str): - """ Returns a DB connection for a provided URL. - - Parameters - ---------- - url: str - The database URL to connect to - - Returns - ------- - An sqlalchemy engine connection. - """ - connect_url = make_url(url) - engine = create_engine(connect_url, isolation_level="AUTOCOMMIT") - - return engine.connect() - - def get_db_engine(self, url: str): - """ Returns a DB engine for a provided URL. - - Parameters - ---------- - url: str - The database URL to create an engine for - - Returns - ------- - An sqlalchemy engine. 
- """ - connect_url = make_url(url) - engine = create_engine(connect_url, isolation_level="AUTOCOMMIT") - - return engine - - def load_checksum(self, path: str, url: str): - """ Loads the xref checksum files into a provided database. - This first combines the checksum data from different xref sources into 1 file called checksum.txt before loading into the DB. - - Parameters - ---------- - path: str - The path where the checksum files can be found - url: str - The database URL to load the checksum data into - """ - checksum_dir = os.path.join(path, 'Checksum') - if not os.path.exists(checksum_dir): os.makedirs(checksum_dir, exist_ok = True) - - # Connect to db - url = url + "?local_infile=1" - db_engine = self.get_db_engine(url) - with db_engine.connect() as dbi: - counter = 1 - source_id = 1 - - # Open the checksum output file - files = os.listdir(checksum_dir) - checksum_file = os.path.join(checksum_dir, 'checksum.txt') - with open(checksum_file, 'w') as output_fh: - # Go through all available checksum files - for file in files: - if re.search("checksum", file): continue - - input_file = os.path.join(checksum_dir, file) - match = re.search(r"\/([A-Za-z]*)-.*$", input_file) - source_name = match.group(1) - source_id = self.get_source_id_from_name(dbi, source_name) - - input_fh = self.get_filehandle(input_file) - for line in input_fh: - line = line.rstrip() - (id, checksum) = re.split(r"\s+", line) - - counter += 1 - output = [str(counter), str(source_id), id, checksum] - output_str = "\t".join(output) - output_fh.write(f'{output_str}\n') - - input_fh.close() - - query = f'load data local infile \'{checksum_file}\' into table checksum_xref' - dbi.execute(text(query)) - - def get_filehandle(self, filename: str): - """ Opens an appropriate read filehandle for a file based on its type. - - Parameters - ---------- - filename: str - The name and path of the file to read - - Returns - ------- - A read filehandle. - - Raises - ------ - FileNotFoundError - If no file name was provided. - If provided file could not be found. - """ - if not filename or filename == '': - raise FileNotFoundError('No file name') - - alt_filename = filename - alt_filename = re.sub(r"\.(gz|Z)$", "", alt_filename) - if alt_filename == filename: - alt_filename = alt_filename + ".gz" - - if not os.path.exists(filename): - if not os.path.exists(alt_filename): - raise FileNotFoundError(f'Could not find either {filename} or {alt_filename}') - filename = alt_filename - - if re.search(r"\.(gz|Z)$", filename): - fh = gzip.open(filename, 'rt') - else: - fh = open(filename, 'r') - - return fh - - def get_source_id_from_name(self, dbi, source_name: str): - """ Retrieves a source ID from its name from a database. - - Parameters - ---------- - dbi: db connection - The database connection to query in - source_name: str - The name of the source - - Returns - ------- - The source ID. - """ - query = select(SourceSORM.source_id).where(SourceSORM.name==source_name) - source_id = dbi.execute(query).scalar() - - return source_id - - def get_file_sections(self, file: str, delimiter: str): - """ Reads a provided file by sections, separated by a provided delimiter. - This function uses 'yield' to provide the file sections one by one. - - Parameters - ---------- - file: str - The name and path of the file to read - delimiter: str - The character or string separating the file sections - - Returns - ------- - A yield of file sections. 
- """ - if re.search(r"\.(gz|Z)$", file): - with gzip.open(file, 'rt') as fh: - groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) - for key,group in groups: - yield list(group) - else: - with open(file, 'r') as fh: - groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) - for key,group in groups: - yield list(group) - - def create_xref_db(self, url: str, config_file: str, preparse:bool): - """ Creates the xref database from model. - This function always drops the database defined by the provided URL (if it exists) before creating a new one. - - Parameters - ---------- - url: str - The database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname] - config_file: str - The name and path of the .ini file that has information about xref sources and species - preparse: bool - Specifies whether source preparsing will be done or not - """ - engine = create_engine(url, isolation_level="AUTOCOMMIT") - - # Drop database and create again - if database_exists(engine.url): - drop_database(engine.url) - create_database(engine.url) - XrefUpdateDB.metadata.create_all(engine) - - xref_dbi = engine.connect() - self.populate_xref_db(xref_dbi, config_file, preparse) - - def populate_xref_db(self, dbi, config_file:str, preparse:bool): - """ Populates the xref database with configuration data. - - Parameters - ---------- - dbi: db connection - The xref database connection - config_file: str - The name and path of the .ini file that has information about xref sources and species to populate the database with - preparse: bool - Specifies whether source preparsing will be done or not (needed to decide if to use old parsers) - - Raises - ------ - KeyError - If a source exists in a species section in the configuration file, but has no source section of its own. 
- """ - source_ids = {} - source_parsers = {} - species_sources = {} - - config = ConfigParser() - config.read(config_file) - - species_sections, sources_sections = {}, {} - - for section_name in config.sections(): - section = config[section_name] - (keyword, name) = re.split(r"\s+", section_name) - - if keyword == 'source': - sources_sections[name] = section - elif keyword == 'species': - species_sections[name] = section - - # Parse species sections - for species_name, section in species_sections.items(): - taxonomy_ids = section.get('taxonomy_id').split(",") - sources = section.get('sources') - aliases = section.get('aliases', species_name) - - species_id = taxonomy_ids[0] - - for tax_id in taxonomy_ids: - # Add new species - query = insert(SpeciesORM).values(species_id=species_id, taxonomy_id=tax_id, name=species_name, aliases=aliases) - dbi.execute(query) - - species_sources[species_id] = sources - - source_id = 0 - # Parse source sections - for source_name, section in sorted(sources_sections.items()): - source_id += 1 - source_name = section.get('name') - order = section.get('order') - priority = section.get('priority') - priority_description = section.get('prio_descr', '') - status = section.get('status', 'NOIDEA') - - old_parser = section.get('old_parser') - if old_parser and not preparse: - parser = old_parser - else: - parser = section.get('parser') - - # Add new source - query = insert(SourceUORM).values(name=source_name, source_release='1', ordered=order, priority=priority, priority_description=priority_description, status=status) - dbi.execute(query) - - source_ids[source_name] = source_id - source_parsers[source_id] = parser - - # Add source url rows - for species_id, sources in species_sources.items(): - source_names = sources.split(",") - - for source_name in source_names: - if not source_ids.get(source_name): - raise KeyError(f'No source section found for {source_name} in config file') - - source_id = source_ids[source_name] - parser = source_parsers[source_id] - query = insert(SourceURLORM).values(source_id=source_id, species_id=species_id, parser=parser) - dbi.execute(query) - - def get_source_id(self, dbi, parser: str, species_id: int, name: str, division_id: int): - """ Retrieves a source ID from its parser, species ID, name or division ID. - - Parameters - ---------- - dbi: db connection - The database connection to query in - parser: str - The source parser - species_id: int - The ID of the species related to the source - name: str - The source name - division_id: int - The ID of the division related to the source - - Returns - ------- - The source ID. 
- """ - name = "%"+name+"%" - source_id = None - - query = select(SourceURLORM.source_id).where(SourceUORM.source_id==SourceURLORM.source_id, SourceURLORM.parser==parser, SourceURLORM.species_id==species_id) - result = dbi.execute(query) - if result.rowcount == 1: - source_id = result.scalar() - - query = select(SourceURLORM.source_id).where(SourceUORM.source_id==SourceURLORM.source_id, SourceURLORM.parser==parser, SourceURLORM.species_id==species_id).filter(SourceUORM.name.like(name)) - result = dbi.execute(query) - if result.rowcount == 1: - source_id = result.scalar() - - if not source_id: - query = select(SourceURLORM.source_id).where(SourceUORM.source_id==SourceURLORM.source_id, SourceURLORM.parser==parser, SourceURLORM.species_id==division_id).filter(SourceUORM.name.like(name)) - result = dbi.execute(query).first() - if result: - source_id = result[0] - - return source_id - - def get_taxon_id(self, dbi): - """ Retrieves the species.taxonomy_id value of the meta table in a database. - - Parameters - ---------- - dbi: db connection - The database connection to query in - - Returns - ------- - The taxonomy ID in the database or 1 if not found. - """ - query = select(MetaCORM.meta_value).where(MetaCORM.meta_key=='species.taxonomy_id') - result = dbi.execute(query) - if result.rowcount > 0: - return result.scalar() - - return 1 - - def get_division_id(self, dbi): - """ Retrives the division ID from a database based on the species.division value of the meta table. - - Parameters - ---------- - dbi: db connection - The database connection to query in - - Returns - ------- - The division ID in the database or 1 if not found - """ - query = select(MetaCORM.meta_value).where(MetaCORM.meta_key=='species.division') - result = dbi.execute(query) - - if result.rowcount > 0: - division = result.scalar() - - division_taxon = { - 'Ensembl' : 7742, - 'EnsemblVertebrates' : 7742, - 'Vertebrates' : 7742, - 'EnsemblMetazoa' : 33208, - 'Metazoa' : 33208, - 'Plants' : 33090, - 'EnsemblPlants' : 33090, - } - - division_id = division_taxon.get(division) - if division_id: - return division_id - - return 1 - - def get_path(self, base_path: str, species: str, release: int, category: str, file_name: str=None): - """ Creates directories based on provided data. - - Parameters - ---------- - base_path: str - The base file path - species: str - The species name - release: int - The ensEMBL release number - category: str - The file category - file_name: str, optional - The file name - - Returns - ------- - A file path. - """ - full_path = os.path.join(base_path, species, release, category) - if not os.path.exists(full_path): - os.makedirs(full_path, exist_ok = True) - - if file_name: - return os.path.join(full_path, file_name) - else: - return full_path - - def get_db_from_registry(self, species: str, group: str, release: int, registry: str): - """ Looks up a db in the registry and returns an sqlaclehmy angine for it. - - Parameters - ---------- - species: str - The species name - group: str - The db group (core, ccds, otherfeatures, etc...) - release: int - The ensEMBL release number - registry: str - The registry url - - Returns - ------- - A db engine or 0 if no db is found. 
- """ - # Fix registry url, if needed - match = re.search(r"^(.*)://(.*)", registry) - if match: registry = match.group(2) - match = re.search(r"(.*)/(.*)", registry) - if match: registry = match.group(1) - - metasearch_url = self.param_required('metasearch_url') - metasearch_body = { - "name_pattern":f'{species}_{group}%', - "filters":[ - { - "meta_key":"schema_version", - "meta_value":release - }, - ], - "servers":[registry] - } - - dbs = requests.post(metasearch_url, json=metasearch_body).json() - dbs = dbs[registry] - - if len(dbs) > 0: - db_url = 'mysql://' + dbs[0] - return db_url - else: - return 0 - - # def get_spark_session(self, data_type): - # if data_type == 'mysql': - # spark = SparkSession.builder.appName('SparkByExamples.com').config("spark.jars", "mysql-connector-java-8.0.13.jar").getOrCreate() - # return spark - # else: - # raise Exception(f'Spark data type {data_type} not supported yet') - - # def get_spark_reader(self, spark_session, data_type, data_url): - # if data_type == 'mysql': - # reader = spark_session.read.format("jdbc").option("driver", "com.mysql.cj.jdbc.Driver").option("url", f'jdbc:{data_url}') - # return reader - # else: - # raise Exception(f'Spark data type {data_type} not supported yet') - - def get_xref_mapper(self, xref_url: str, species: str, base_path: str, release: int, core_url: str=None, registry: str=None): - """ Retrives a mapper object based on species. - - Parameters - ---------- - xref_url: str - The xref db connection url - species: str - The species name - base_path: str - The base file path - release: int - The ensEMBL release number - core_db: str, optional - The species core db connection url - registry: str, optional - The registry url - - Returns - ------- - A mapper object - """ - # Need either core_db or registry - if not core_url and not registry: - raise AttributeError(f'Method get_xref_mapper: need to provide either a core DB URL or a registry URL') - - # Create needed db connections - if not core_url: - core_url = self.get_db_from_registry(species, 'core', release, registry) - - core_db = self.get_db_engine(core_url) - xref_db = self.get_db_engine(xref_url) - - # Extract host and dbname from xref url - xref_url_obj = make_url(xref_url) - host = xref_url_obj.host - dbname = xref_url_obj.database - - # Locate the fasta files - cdna_path = self.get_path(base_path, species, release, 'ensembl', 'transcripts.fa'); - pep_path = self.get_path(base_path, species, release, 'ensembl', 'peptides.fa'); - - # Try to find a species-specific mapper first - module_name = f'ensembl.xrefs.mappers.{species}' - class_name = species - found = importlib.find_loader(module_name) - if not found: - module_name = 'ensembl.xrefs.mappers.BasicMapper' - class_name = 'BasicMapper' - - # Create a mapper object - module = importlib.import_module(module_name) - module_class = getattr(module, class_name) - mapper = module_class() - - mapper.xref(xref_db) - mapper.add_meta_pair('xref', f'{host}:{dbname}') - mapper.core(core_db) - mapper.add_meta_pair('species', f'{host}:{dbname}') - mapper.dna_file(cdna_path) - mapper.protein_file(pep_path) - mapper.log_file(self._log_file) - - return mapper - - + with open(file, "r") as fh: + groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) + for key, group in groups: + yield list(group) + + def create_xref_db(self, url: str, config_file: str) -> None: + """Creates the xref database from model. + This function always drops the database defined by the provided URL (if it exists) before creating a new one. 
+ + Parameters + ---------- + url: str + The database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname] + config_file: str + The name and path of the .ini file that has information about xref sources and species + """ + engine = create_engine(url, isolation_level="AUTOCOMMIT") + + # Drop database and create again + if database_exists(engine.url): + drop_database(engine.url) + create_database(engine.url) + XrefUpdateDB.metadata.create_all(engine) + + xref_dbi = engine.connect() + self.populate_xref_db(xref_dbi, config_file) + + def populate_xref_db(self, dbi: Connection, config_file: str) -> None: + """Populates the xref database with configuration data. + + Parameters + ---------- + dbi: db connection + The xref database connection + config_file: str + The name and path of the .ini file that has information about xref sources and species to populate the database with + + Raises + ------ + KeyError + If a source exists in a species section in the configuration file, but has no source section of its own. + """ + source_ids, source_parsers, species_sources = {}, {}, {} + species_sections, sources_sections = {}, {} + + config = ConfigParser() + config.read(config_file) + + for section_name in config.sections(): + section = config[section_name] + (keyword, name) = re.split(r"\s+", section_name) + + if keyword == "source": + sources_sections[name] = section + elif keyword == "species": + species_sections[name] = section + + # Parse species sections + for species_name, section in species_sections.items(): + taxonomy_ids = section.get("taxonomy_id").split(",") + sources = section.get("sources") + aliases = section.get("aliases", species_name) + + species_id = taxonomy_ids[0] + + for tax_id in taxonomy_ids: + # Add new species + dbi.execute( + insert(SpeciesORM).values( + species_id=species_id, + taxonomy_id=tax_id, + name=species_name, + aliases=aliases, + ) + ) + + species_sources[species_id] = sources + + source_id = 0 + # Parse source sections + for source_name, section in sorted(sources_sections.items()): + source_id += 1 + source_db_name = section.get("name") + order = section.get("order") + priority = section.get("priority") + priority_description = section.get("prio_descr", "") + status = section.get("status", "NOIDEA") + parser = section.get("parser") + + # Add new source + dbi.execute( + insert(SourceUORM).values( + name=source_db_name, + source_release="1", + ordered=order, + priority=priority, + priority_description=priority_description, + status=status, + ) + ) + + source_ids[source_name] = source_id + source_parsers[source_id] = parser + + # Add source url rows + for species_id, sources in species_sources.items(): + source_names = sources.split(",") + + for source_name in source_names: + if not source_ids.get(source_name): + raise KeyError( + f"No source section found for {source_name} in config file" + ) + + source_id = source_ids[source_name] + parser = source_parsers[source_id] + dbi.execute( + insert(SourceURLORM).values( + source_id=source_id, species_id=species_id, parser=parser + ) + ) + + def get_source_id(self, dbi: Connection, parser: str, species_id: int, name: str, division_id: int) -> Optional[int]: + """Retrieves a source ID from its parser, species ID, name or division ID. 
+ + Parameters + ---------- + dbi: db connection + The database connection to query in + parser: str + The source parser + species_id: int + The ID of the species related to the source + name: str + The source name + division_id: int + The ID of the division related to the source + + Returns + ------- + The source ID. + """ + name = "%" + name + "%" + source_id = None + + query = select(SourceURLORM.source_id).where( + SourceUORM.source_id == SourceURLORM.source_id, + SourceURLORM.parser == parser, + SourceURLORM.species_id == species_id, + ) + result = dbi.execute(query) + if result.rowcount == 1: + source_id = result.scalar() + + query = ( + select(SourceURLORM.source_id) + .where( + SourceUORM.source_id == SourceURLORM.source_id, + SourceURLORM.parser == parser, + SourceURLORM.species_id == species_id, + ) + .filter(SourceUORM.name.like(name)) + ) + result = dbi.execute(query) + if result.rowcount == 1: + source_id = result.scalar() + + if not source_id: + query = ( + select(SourceURLORM.source_id) + .where( + SourceUORM.source_id == SourceURLORM.source_id, + SourceURLORM.parser == parser, + SourceURLORM.species_id == division_id, + ) + .filter(SourceUORM.name.like(name)) + ) + result = dbi.execute(query).first() + if result: + source_id = result[0] + + return source_id + + def get_taxon_id(self, dbi: Connection) -> int: + """Retrieves the species.taxonomy_id value of the meta table in a database. + + Parameters + ---------- + dbi: db connection + The database connection to query in + + Returns + ------- + The taxonomy ID in the database or 1 if not found. + """ + result = dbi.execute( + select(MetaCORM.meta_value).where( + MetaCORM.meta_key == "species.taxonomy_id" + ) + ) + if result.rowcount > 0: + return int(result.scalar()) + + return 1 + + def get_division_id(self, dbi: Connection) -> int: + """Retrives the division ID from a database based on the species.division value of the meta table. + + Parameters + ---------- + dbi: db connection + The database connection to query in + + Returns + ------- + The division ID in the database or 1 if not found + """ + result = dbi.execute( + select(MetaCORM.meta_value).where(MetaCORM.meta_key == "species.division") + ) + + if result.rowcount > 0: + division = result.scalar() + + division_taxon = { + "Ensembl": 7742, + "EnsemblVertebrates": 7742, + "Vertebrates": 7742, + "EnsemblMetazoa": 33208, + "Metazoa": 33208, + "Plants": 33090, + "EnsemblPlants": 33090, + } + + division_id = division_taxon.get(division) + if division_id: + return int(division_id) + + return 1 + + def get_path(self, base_path: str, species: str, release: int, category: str, file_name: str = None) -> str: + """Creates directories based on provided data. + + Parameters + ---------- + base_path: str + The base file path + species: str + The species name + release: int + The ensEMBL release number + category: str + The file category + file_name: str, optional + The file name + + Returns + ------- + A file path. + """ + full_path = os.path.join(base_path, species, release, category) + if not os.path.exists(full_path): + os.makedirs(full_path, exist_ok=True) + + if file_name: + return os.path.join(full_path, file_name) + else: + return full_path + + def get_db_from_registry(self, species: str, group: str, release: int, registry: str) -> Optional[str]: + """Looks up a db in the registry and returns an sqlaclehmy angine for it. + + Parameters + ---------- + species: str + The species name + group: str + The db group (core, ccds, otherfeatures, etc...) 
+ release: int + The ensEMBL release number + registry: str + The registry url + + Returns + ------- + A db engine or 0 if no db is found. + """ + # Fix registry url, if needed + match = re.search(r"^(.*)://(.*)", registry) + if match: + registry = match.group(2) + match = re.search(r"(.*)/(.*)", registry) + if match: + registry = match.group(1) + + metasearch_url = self.param_required("metasearch_url") + metasearch_body = { + "name_pattern": f"{species}_{group}%", + "filters": [ + {"meta_key": "schema_version", "meta_value": str(release)}, + ], + "servers": [registry], + } + + dbs = requests.post(metasearch_url, json=metasearch_body).json() + dbs = dbs[registry] + + if len(dbs) > 0: + db_url = "mysql://" + dbs[0] + return db_url + else: + return None + + def get_xref_mapper(self, xref_url: str, species: str, base_path: str, release: int, core_url: str = None, registry: str = None) -> BasicMapper: + """Retrives a mapper object based on species. + + Parameters + ---------- + xref_url: str + The xref db connection url + species: str + The species name + base_path: str + The base file path + release: int + The ensEMBL release number + core_db: str, optional + The species core db connection url + registry: str, optional + The registry url + + Returns + ------- + A mapper object + """ + # Need either core_db or registry + if not core_url and not registry: + raise AttributeError( + f"Method get_xref_mapper: need to provide either a core DB URL or a registry URL" + ) + + # Create needed db connections + if not core_url: + core_url = self.get_db_from_registry(species, "core", release, registry) + + core_db = self.get_db_engine(core_url) + xref_db = self.get_db_engine(xref_url) + + # Extract host and dbname from xref url + xref_url_obj = make_url(xref_url) + host = xref_url_obj.host + dbname = xref_url_obj.database + + # Locate the fasta files + cdna_path = self.get_path( + base_path, species, release, "ensembl", "transcripts.fa" + ) + pep_path = self.get_path(base_path, species, release, "ensembl", "peptides.fa") + + # Try to find a species-specific mapper first + module_name = f"ensembl.production.xrefs.mappers.species.{species}" + class_name = species + found = importlib.util.find_spec(module_name) + if not found: + module_name = "ensembl.production.xrefs.mappers.BasicMapper" + class_name = "BasicMapper" + + # Create a mapper object + module = importlib.import_module(module_name) + module_class = getattr(module, class_name) + mapper = module_class() + + mapper.xref(xref_db) + mapper.add_meta_pair("xref", f"{host}:{dbname}") + mapper.core(core_db) + mapper.add_meta_pair("species", f"{host}:{dbname}") + mapper.dna_file(cdna_path) + mapper.protein_file(pep_path) + mapper.log_file(self._log_file) + mapper.species_dir(os.path.join(base_path, species)) + + return mapper diff --git a/src/python/ensembl/production/xrefs/Checksum.py b/src/python/ensembl/production/xrefs/Checksum.py index 7ccb401a7..7edf452e0 100644 --- a/src/python/ensembl/production/xrefs/Checksum.py +++ b/src/python/ensembl/production/xrefs/Checksum.py @@ -16,31 +16,31 @@ from ensembl.production.xrefs.Base import * -class Checksum(Base): - def run(self): - base_path = self.param_required('base_path') - source_db_url = self.param_required('source_db_url') - skip_download = self.param_required('skip_download', {'type': 'bool'}) - - logging.info('Checksum starting with parameters:') - logging.info(f'Param: base_path = {base_path}') - logging.info(f'Param: source_db_url = {source_db_url}') - logging.info(f'Param: skip_download = 
{skip_download}') - - # Connect to source db - db_engine = self.get_db_engine(source_db_url) - - # Check if checksums already exist - table_nonempty = 0 - if skip_download: - with db_engine.connect() as dbi: - query = select(func.count(ChecksumXrefSORM.checksum_xref_id)) - table_nonempty = dbi.execute(query).scalar() - - # Load checksums from files into db - if not table_nonempty: - self.load_checksum(base_path, source_db_url) - logging.info('Checksum data loaded') - else: - logging.info('Checksum data already exists, skipping loading') +class Checksum(Base): + def run(self): + base_path = self.param_required("base_path", {"type": "str"}) + source_db_url = self.param_required("source_db_url", {"type": "str"}) + skip_download = self.param_required("skip_download", {"type": "bool"}) + + logging.info("Checksum starting with parameters:") + logging.info(f"Param: base_path = {base_path}") + logging.info(f"Param: source_db_url = {source_db_url}") + logging.info(f"Param: skip_download = {skip_download}") + + # Connect to source db + db_engine = self.get_db_engine(source_db_url) + + # Check if checksums already exist + table_nonempty = 0 + if skip_download: + with db_engine.connect() as dbi: + query = select(func.count(ChecksumXrefSORM.checksum_xref_id)) + table_nonempty = dbi.execute(query).scalar() + + # Load checksums from files into db + if not table_nonempty: + self.load_checksum(base_path, source_db_url) + logging.info("Checksum data loaded") + else: + logging.info("Checksum data already exists, skipping loading") diff --git a/src/python/ensembl/production/xrefs/CoordinateMapping.py b/src/python/ensembl/production/xrefs/CoordinateMapping.py new file mode 100644 index 000000000..d687ebee1 --- /dev/null +++ b/src/python/ensembl/production/xrefs/CoordinateMapping.py @@ -0,0 +1,50 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
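For orientation, a minimal sketch of driving the Checksum step above on its own, assuming placeholder paths and a placeholder source-database URL; in the pipeline itself these parameters arrive via the workflow dataflow rather than being hard-coded.

from ensembl.production.xrefs.Checksum import Checksum

# All values below are placeholders; base_path is expected to hold a Checksum/
# directory containing the per-source checksum files to load.
Checksum({
    "base_path": "/path/to/xref/run",
    "source_db_url": "mysql://user:password@host:3306/xref_source_db",
    "skip_download": False,
}).run()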
+ +"""Xref module to process the coordinate mappings.""" + +from ensembl.production.xrefs.Base import * +from ensembl.production.xrefs.mappers.CoordinateMapper import CoordinateMapper + + +class CoordinateMapping(Base): + def run(self): + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + scripts_dir = self.param_required("perl_scripts_dir", {"type": "str"}) + registry = self.param("registry_url", None, {"type": "str"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + + logging.info(f"CoordinateMapping starting for species '{species_name}'") + + if not core_db_url: + core_db_url = self.get_db_from_registry( + species_name, "core", release, registry + ) + + # Get species id + db_engine = self.get_db_engine(core_db_url) + with db_engine.connect() as core_dbi: + species_id = self.get_taxon_id(core_dbi) + + # Get the appropriate mapper + mapper = self.get_xref_mapper( + xref_db_url, species_name, base_path, release, core_db_url, registry + ) + + # Process the coordinate xrefs + coord = CoordinateMapper(mapper) + coord.run_coordinatemapping(species_name, species_id, scripts_dir) diff --git a/src/python/ensembl/production/xrefs/DirectXrefs.py b/src/python/ensembl/production/xrefs/DirectXrefs.py new file mode 100644 index 000000000..f6522b274 --- /dev/null +++ b/src/python/ensembl/production/xrefs/DirectXrefs.py @@ -0,0 +1,39 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Xref module to process direct xrefs.""" + +from ensembl.production.xrefs.Base import * +from ensembl.production.xrefs.mappers.DirectXrefsMapper import DirectXrefsMapper + + +class DirectXrefs(Base): + def run(self): + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + registry = self.param("registry_url", None, {"type": "str"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + + logging.info(f"DirectXrefs starting for species '{species_name}'") + + # Get the appropriate mapper + mapper = self.get_xref_mapper( + xref_db_url, species_name, base_path, release, core_db_url, registry + ) + + # Process the direct xrefs + direct_mappings = DirectXrefsMapper(mapper) + direct_mappings.process() diff --git a/src/python/ensembl/production/xrefs/DownloadSource.py b/src/python/ensembl/production/xrefs/DownloadSource.py index 060fcb116..f3b9f20f4 100644 --- a/src/python/ensembl/production/xrefs/DownloadSource.py +++ b/src/python/ensembl/production/xrefs/DownloadSource.py @@ -16,48 +16,58 @@ from ensembl.production.xrefs.Base import * + class DownloadSource(Base): - def run(self): - base_path = self.param_required('base_path') - parser = self.param_required('parser') - name = self.param_required('name') - priority = self.param_required('priority') - source_db_url = self.param_required('source_db_url') - file = self.param_required('file') - skip_download = self.param_required('skip_download', {'type': 'bool'}) - db = self.param('db') - version_file = self.param('version_file') - preparse = self.param('preparse', None, {'type': 'bool'}) - rel_number = self.param('rel_number') - catalog = self.param('catalog') - - logging.info(f'DownloadSource starting for source {name}') - - # Download the main xref file - extra_args = {} - extra_args['skip_download_if_file_present'] = skip_download - extra_args['db'] = db - if rel_number and catalog: - extra_args['rel_number'] = rel_number - extra_args['catalog'] = catalog - file_name = self.download_file(file, base_path, name, extra_args) - - # Download the version file - version = "" - if version_file: - extra_args['release'] = 'version' - version = self.download_file(version_file, base_path, name, extra_args) - - # Update source db - db_engine = self.get_db_engine(source_db_url) - with db_engine.connect() as dbi: - query = insert(SourceSORM).values(name=name, parser=parser).prefix_with('IGNORE') - dbi.execute(query) - - query = select(SourceSORM.source_id).where(SourceSORM.name==name) - source_id = dbi.execute(query).scalar() - - if preparse is None: preparse = False - query = insert(VersionORM).values(source_id=source_id, uri=file_name, index_uri=db, count_seen=priority, revision=version, preparse=preparse).prefix_with('IGNORE') - dbi.execute(query) + def run(self): + base_path = self.param_required("base_path", {"type": "str"}) + parser = self.param_required("parser", {"type": "str"}) + name = self.param_required("name", {"type": "str"}) + priority = self.param_required("priority", {"type": "int"}) + source_db_url = self.param_required("source_db_url", {"type": "str"}) + file = self.param_required("file", {"type": "str"}) + skip_download = self.param_required("skip_download", {"type": "bool"}) + db = self.param("db", None, {"type": "str"}) + version_file = self.param("version_file", None, {"type": "str"}) + rel_number = 
self.param("rel_number", None, {"type": "str"}) + catalog = self.param("catalog", None, {"type": "str"}) + + logging.info(f"DownloadSource starting for source {name}") + + # Download the main xref file + extra_args = {} + extra_args["skip_download_if_file_present"] = skip_download + extra_args["db"] = db + if rel_number and catalog: + extra_args["rel_number"] = rel_number + extra_args["catalog"] = catalog + file_name = self.download_file(file, base_path, name, extra_args) + + # Download the version file + version = "" + if version_file: + extra_args["release"] = "version" + version = self.download_file(version_file, base_path, name, extra_args) + + # Update source db + db_engine = self.get_db_engine(source_db_url) + with db_engine.connect() as dbi: + dbi.execute( + insert(SourceSORM) + .values(name=name, parser=parser) + .prefix_with("IGNORE") + ) + source_id = dbi.execute( + select(SourceSORM.source_id).where(SourceSORM.name == name) + ).scalar() + dbi.execute( + insert(VersionORM) + .values( + source_id=source_id, + file_path=file_name, + db=db, + priority=priority, + revision=version, + ) + .prefix_with("IGNORE") + ) diff --git a/src/python/ensembl/production/xrefs/DumpEnsembl.py b/src/python/ensembl/production/xrefs/DumpEnsembl.py new file mode 100644 index 000000000..84ce39b47 --- /dev/null +++ b/src/python/ensembl/production/xrefs/DumpEnsembl.py @@ -0,0 +1,81 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Dumping module to dump sequence data from a core db.""" + +from ensembl.production.xrefs.Base import * + + +class DumpEnsembl(Base): + def run(self): + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + core_db_url = self.param_required("species_db", {"type": "str"}) + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + retry = self.param("retry", None, {"type": "bool", "default": False}) + + logging.info(f"DumpEnsembl starting for species '{species_name}'") + + # Create files paths + cdna_path = self.get_path( + base_path, species_name, release, "ensembl", "transcripts.fa" + ) + pep_path = self.get_path( + base_path, species_name, release, "ensembl", "peptides.fa" + ) + + # Check if dumping has been done for this run before, to speed up development by not having to re-dump sequences + if ( + not retry + and os.path.exists(cdna_path) + and os.path.getsize(cdna_path) > 0 + and os.path.exists(pep_path) + and os.path.getsize(pep_path) > 0 + ): + logging.info( + f"Dna and peptide data already dumped for species '{species_name}', skipping." 
+ ) + else: + scripts_dir = self.param_required("perl_scripts_dir") + + logging.info(f"Running perl script {scripts_dir}/dump_ensembl.pl") + perl_cmd = f"perl {scripts_dir}/dump_ensembl.pl --cdna_path '{cdna_path}' --pep_path '{pep_path}' --species {species_name} --core_db_url '{core_db_url}' --release {release}" + cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE) + + # Create jobs for peptide dumping and alignment + dataflow_params = { + "species_name": species_name, + "file_path": pep_path, + "xref_db_url": xref_db_url, + "seq_type": "peptide", + } + self.write_output("dump_xref", dataflow_params) + + # Create jobs for cdna dumping and alignment + dataflow_params = { + "species_name": species_name, + "file_path": cdna_path, + "xref_db_url": xref_db_url, + "seq_type": "dna", + } + self.write_output("dump_xref", dataflow_params) + + # Create job for schedule mapping + dataflow_params = { + "species_name": species_name, + "xref_db_url": xref_db_url, + "species_db": core_db_url, + } + self.write_output("schedule_mapping", dataflow_params) diff --git a/src/python/ensembl/production/xrefs/DumpXref.py b/src/python/ensembl/production/xrefs/DumpXref.py new file mode 100644 index 000000000..268c8cae2 --- /dev/null +++ b/src/python/ensembl/production/xrefs/DumpXref.py @@ -0,0 +1,135 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
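A hedged example of the Base.get_file_sections generator defined earlier: because it is built on itertools.groupby, it yields alternating runs of lines, those starting with the delimiter and those in between. The file name and delimiter here are illustrative (flat files such as RefSeq GPFF separate records with lines starting with "//").

import logging

from ensembl.production.xrefs.Base import Base

class ExampleRecordReader(Base):
    def run(self):
        # Illustrative path and delimiter; any module inheriting Base can call this helper.
        for section in self.get_file_sections("/path/to/records.dat.gz", "//"):
            if section[0].lstrip().startswith("//"):
                continue  # a run of separator lines, nothing to parse
            record = "".join(section)
            logging.info(f"Read a record spanning {len(section)} lines")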
+ +"""Dumping module to dump xref sequence data from an xref intermediate db.""" + +from ensembl.production.xrefs.Base import * + +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + + +class DumpXref(Base): + def run(self): + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + file_path = self.param_required("file_path", {"type": "str"}) + seq_type = self.param_required("seq_type", {"type": "str"}) + config_file = self.param_required("config_file", {"type": "str"}) + + logging.info( + f"DumpXref starting for species '{species_name}' with file_path '{file_path}' and seq_type '{seq_type}'" + ) + + # Connect to xref db + xref_dbi = self.get_dbi(xref_db_url) + + # Create output path + full_path = self.get_path(base_path, species_name, release, "xref") + + # Extract sources to download from config file + sources = [] + with open(config_file) as conf_file: + sources = json.load(conf_file) + + # Create hash of available alignment methods + method = {} + query_cutoff = {} + target_cutoff = {} + for source in sources: + if source.get("method"): + method[source["name"]] = source["method"] + query_cutoff[source["name"]] = source.get("query_cutoff") + target_cutoff[source["name"]] = source.get("target_cutoff") + + job_index = 1 + + # Get sources related to sequence type + source_query = select(SourceUORM.name.distinct(), SourceUORM.source_id).where( + SourceUORM.source_id == XrefUORM.source_id, + XrefUORM.xref_id == PrimaryXrefORM.xref_id, + PrimaryXrefORM.sequence_type == seq_type, + ) + for source in xref_dbi.execute(source_query).mappings().all(): + source_name = source.name + source_id = source.source_id + + if re.search(r"RefSeq_.*RNA", source_name): + source_name = "RefSeq_dna" + if re.search("RefSeq_peptide", source_name): + source_name = "RefSeq_peptide" + + if method.get(source_name): + method_name = method[source_name] + source_query_cutoff = query_cutoff[source_name] + source_target_cutoff = target_cutoff[source_name] + + # Open fasta file + file_source_name = source.name + file_source_name = re.sub(r"\/", "", file_source_name) + filename = os.path.join( + full_path, f"{seq_type}_{file_source_name}_{source_id}.fasta" + ) + fasta_fh = open(filename, "w") + + # Get xref sequences + sequence_query = select( + PrimaryXrefORM.xref_id, PrimaryXrefORM.sequence + ).where( + XrefUORM.xref_id == PrimaryXrefORM.xref_id, + PrimaryXrefORM.sequence_type == seq_type, + XrefUORM.source_id == source_id, + ) + for sequence in xref_dbi.execute(sequence_query).mappings().all(): + # Ambiguous peptides must be cleaned out to protect Exonerate from J,O and U codes + seq = sequence.sequence.upper() + if seq_type == "peptide": + seq = re.sub(r"(J|O|U)", "X", seq) + + # Print sequence + SeqIO.write( + SeqRecord(Seq(seq), id=str(sequence.xref_id), description=""), + fasta_fh, + "fasta", + ) + + fasta_fh.close() + + # Pass data into alignment jobs + self.write_output( + "schedule_alignment", + { + "species_name": species_name, + "ensembl_fasta": file_path, + "seq_type": seq_type, + "xref_db_url": xref_db_url, + "method": method_name, + "query_cutoff": source_query_cutoff, + "target_cutoff": source_target_cutoff, + "job_index": job_index, + "source_id": source_id, + "source_name": source_name, + "xref_fasta": filename, + }, + ) + job_index += 1 + + xref_dbi.close() + + if 
job_index == 1: + with open("dataflow_schedule_alignment.json", "a") as fh: + fh.write("") diff --git a/src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py b/src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py new file mode 100644 index 000000000..3513c7afc --- /dev/null +++ b/src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py @@ -0,0 +1,100 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Email module to send user emails notifying of advisory DC failures.""" + +from ensembl.production.xrefs.Base import * + +from smtplib import SMTP +from email.message import EmailMessage + + +class EmailAdvisoryXrefReport(Base): + def run(self): + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + pipeline_name = self.param_required("pipeline_name", {"type": "str"}) + email_address = self.param_required("email", {"type": "str"}) + email_server = self.param_required("email_server", {"type": "str"}) + log_timestamp = self.param("log_timestamp", None, {"type": "str"}) + + # Get the path and name of main reports file + formatted_name = re.sub(r"\s", "_", pipeline_name) + main_report_file_name = f"dc_report_{formatted_name}" + if log_timestamp: + log_path = os.path.join(base_path, "logs", log_timestamp) + main_report_file_name = f"{main_report_file_name}_{log_timestamp}.log" + else: + log_path = os.path.join(base_path, "logs") + if not os.path.exists(log_path): + os.makedir(log_path) + main_report_file_name = f"{main_report_file_name}.log" + + main_report_file = os.path.join(log_path, main_report_file_name) + main_fh = open(main_report_file, "a") + + species_with_reports = {} + + # Get species in base path + species_list = os.listdir(base_path) + + for species in species_list: + # Check if reports exist + dc_path = os.path.join(base_path, species, release, "dc_report") + if os.path.exists(dc_path): + # Get report files + dc_files = os.listdir(dc_path) + + # Add each dc report into main report file + for dc_file in dc_files: + with open(os.path.join(dc_path, dc_file), "r") as file: + dc_data = file.read() + + main_fh.write(f"{dc_data}\n") + + dc_name = dc_file.replace(".log", "") + if species_with_reports.get(dc_name): + species_with_reports[dc_name].append(species) + else: + species_with_reports[dc_name] = [species] + + # TO DO: maybe delete individual reports + + main_fh.close() + + email_message = f"Some advisory datachecks have failed for the following species in the xref pipeline run ({pipeline_name}).

" + for dc_name, species_list in species_with_reports.items(): + email_message += f"Datacheck {dc_name}:
" + email_message += "
    " + for species_name in species_list: + email_message += f"
  • {species_name}
  • " + email_message += "
" + + email_message += "
DC failures details attached in this email." + + # Send email + message = EmailMessage() + message["Subject"] = f"Advisory DC Report (release {release})" + message["From"] = email_address + message["To"] = email_address + message.set_content(email_message, "html") + + with open(main_report_file, "rb") as fh: + file_data = fh.read() + message.add_attachment( + file_data, maintype="text", subtype="plain", filename=main_report_file_name + ) + + smtp = SMTP(email_server) + smtp.send_message(message) diff --git a/src/python/ensembl/production/xrefs/EmailNotification.py b/src/python/ensembl/production/xrefs/EmailNotification.py index 22738d990..4295041a0 100644 --- a/src/python/ensembl/production/xrefs/EmailNotification.py +++ b/src/python/ensembl/production/xrefs/EmailNotification.py @@ -19,118 +19,288 @@ from smtplib import SMTP from email.message import EmailMessage -class EmailNotification(Base): - def run(self): - pipeline_name = self.param_required('pipeline_name') - base_path = self.param_required('base_path') - email_address = self.param_required('email') - email_server = self.param_required('email_server') - log_timestamp = self.param('log_timestamp') - - email_message = f'The {pipeline_name} has completed its run.
' - - if log_timestamp: - # Get the path of the log files - log_path = os.path.join(base_path, 'logs', log_timestamp) - # Read the log file - if os.path.exists(log_path): +class EmailNotification(Base): + def run(self): + pipeline_name = self.param_required("pipeline_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + email_address = self.param_required("email", {"type": "str"}) + email_server = self.param_required("email_server", {"type": "str"}) + log_timestamp = self.param("log_timestamp", None, {"type": "str"}) + + email_message = f"The {pipeline_name} has completed its run.
" + + indent = "   " + + if log_timestamp: + # Get the path of the log files + log_path = os.path.join(base_path, "logs", log_timestamp) + + # Read the log file + if os.path.exists(log_path): + parameters = {} + + # Copy different log files into a main one + main_log_file = self.combine_logs( + base_path, log_timestamp, pipeline_name + ) + + # Read the full logs + with open(main_log_file) as fh: + data = fh.read() + + # Extract parameter data + parameters_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Param: (\w+) = (.*)", data + ) + parameters = {param[0]: param[1] for param in parameters_list} + + email_message += ( + "
The pipeline was run with the following parameters:
" + ) + for param_name, param_value in parameters.items(): + if param_value == "1" or param_value == "0": + param_value = bool(param_value) + email_message += f"{param_name} = {param_value}
" + + # Extract statistics data from logs + if re.search("Download", pipeline_name): + sources_data, added_species, skipped_species = {}, {}, {} + + # Get sources scheduled for download + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to download: ([\w\/]+)", + data, + ) + sources_data = { + source: {"to_download": 1} for source in matches_list + } + + # Get sources scheduled for cleanup + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to cleanup: ([\w\/]+)", + data, + ) + for source in matches_list: + sources_data[source].update({"to_cleanup": 1}) + + # Get sources cleaned up + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source ([\w\/]+) cleaned up", + data, + ) + for source in matches_list: + sources_data[source].update({"cleaned_up": 1}) + + # Get sources with skipped download + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file already exists, skipping download \((.*)\)", + data, + ) + for source in matches_list: + sources_data[source[0]].update( + {"skipped": os.path.dirname(source[1])} + ) + + # Get sources downloaded + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file downloaded via (HTTP|FTP): (.*)", + data, + ) + for source in matches_list: + sources_data[source[0]].update( + {"downloaded": source[1] + "|" + os.path.dirname(source[2])} + ) + + # Get sources copied from local ftp + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file copied from local FTP: (.*)", + data, + ) + for source in matches_list: + sources_data[source[0]].update( + {"copied": os.path.dirname(source[1])} + ) + + # Get skipped species + skipped_species_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) skipped species = (\d+)", + data, + ) + skipped_species = { + source[0]: source[1] for source in skipped_species_list + } + + # Get species with files created + added_species_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) species files created = (\d+)", + data, + ) + added_species = { + source[0]: source[1] for source in added_species_list + } + + # Add source statistics to email message + email_message += "
--Source Statistics--
" + for source_name, source_values in sources.items(): + email_message += f"{source_name}:
" + if source_values.get("to_download"): + email_message += f"{indent}Scheduled for download ✔
" + + if source_values.get("downloaded"): + (download_type, file_path) = source_values[ + "downloaded" + ].split("|") + email_message += f"{indent}File downloaded via {download_type} into {file_path}
" + elif source_values.get("copied"): + email_message += ( + indent + + "File(s) copied from local FTP into %s
" + % (source_values["copied"]) + ) + elif source_values.get("skipped"): + email_message += ( + indent + + "File(s) download skipped, already exists in %s
" + % (source_values["skipped"]) + ) + + if source_values.get("to_cleanup"): + email_message += f"{indent}Scheduled for cleanup ✔
" + if source_values.get("cleaned_up"): + email_message += f"{indent}Cleaned up ✔
" + + # Add species statistics to email message + email_message += "
--Species Statistics--
" + email_message += "Skipped Species (files already exist):
" + for source_name, count in skipped_species.items(): + email_message += f"{indent}{source_name}: {count}
" + email_message += "Added Species (files created):
" + for source_name, count in added_species.items(): + email_message += f"{indent}{source_name}: {count}
" + + email_message += "
To run the Xref Process Pipeline based on the data from this pipeline, use the same --source_db_url and --config_file values provided to this pipeline." + elif re.search("Process", pipeline_name): + parsed_sources, species_counts = {}, {} + + # Get species mapped + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Mapping starting for species '([\w\/]+)'", + data, + ) + for species_name in matches_list: + species_counts[species_name] = { + "DIRECT": 0, + "INFERRED_PAIR": 0, + "MISC": 0, + "CHECKSUM": 0, + "DEPENDENT": 0, + "SEQUENCE_MATCH": 0, + } + + # Get number of xrefs added per species per source + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| \tLoaded (\d+) ([\w\/]+) xrefs for '([\w\/]+)'", + data, + ) + for species in matches_list: + count = int(species[0]) + xref_type = species[1] + species_name = species[2] + + prev_count = species_counts[species_name][xref_type] + count += prev_count + + species_counts[species_name][xref_type] = count + + # Get parsed sources per species + matches_list = re.findall( + r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ParseSource starting for source '([\w\/]+)' with parser '([\w\/]+)' for species '([\w\/]+)'", + data, + ) + for species in matches_list: + source_name = species[0] + parser = species[1] + species_name = species[2] + + parsed_sources.setdefault(species_name, {}).update({source_name: parser}) + + # Add species statistics to email message + email_message += "&#13;
--Species Statistics--
" + for species_name, species_data in parsed_sources.items(): + email_message += f"{species_name}:
" + email_message += f"{indent}Sources parsed: " + ",".join(keys(species_data)) + + xref_counts = species_counts[species_name] + email_message += indent + "Xrefs added: " + for xref_type, count in xref_counts.items(): + email_message += f"{count} {xref_type} " + + # Send email + message = EmailMessage() + message["Subject"] = f"{pipeline_name} Finished" + message["From"] = email_address + message["To"] = email_address + message.set_content(email_message, "html") + + smtp = SMTP(email_server) + smtp.send_message(message) + + def combine_logs(self, base_path: str, timestamp: str, type: str) -> str: + ordered_processes = { + "download": [ + "ScheduleDownload", + "DownloadSource", + "ScheduleCleanup", + "Cleanup(.*)Source", + "EmailNotification", + ], + "process": [ + "ScheduleSpecies", + "ScheduleParse", + "ParseSource", + "(.*)Parser", + "DumpEnsembl", + "DumpXref", + "ScheduleAlignment", + "Alignment", + "ScheduleMapping", + "DirectXrefs", + "ProcessAlignment", + "RNACentralMapping", + "UniParcMapping", + "CoordinateMapping", + "Mapping", + "AdvisoryXrefReport", + "EmailAdvisoryXrefReport", + "EmailNotification", + ], + } + log_order = ( + ordered_processes["download"] + if re.search("Download", type) + else ordered_processes["process"] + ) + + log_path = os.path.join(base_path, "logs", timestamp) log_files = os.listdir(log_path) - parameters, sources, added_species, skipped_species = {}, {}, {}, {} - - main_log_file = os.path.join(base_path, 'logs', log_timestamp, 'logfile_'+log_timestamp) + main_log_file = os.path.join( + base_path, "logs", timestamp, "logfile_" + timestamp + ) # Copy different log files into a main one - with open(main_log_file, 'a') as out_fh: - for log_file in log_files: - if not re.search(r"^tmp_", log_file): continue - log_file = os.path.join(log_path, log_file) - with open(log_file) as in_fh: - log_data = in_fh.read() - out_fh.write(log_data) - os.remove(log_file) - - # Read the full logs - with open(main_log_file) as fh: - data = fh.read() - - # Extract parameter data - parameters_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Param: (\w+) = (.*)", data) - parameters = {param[0]: param[1] for param in parameters_list} - - email_message += '
The pipeline was run with the following parameters:
' - for param_name,param_value in parameters.items(): - email_message += f'{param_name} = {param_value}
' - - if re.search('Download', pipeline_name): - #Extract data from logs - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to download: ([\w\/]+)", data) - sources = {source : {'to_download' : 1} for source in sources_list} - - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to cleanup: ([\w\/]+)", data) - for source in sources_list: sources[source].update({'to_cleanup' : 1}) - - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to preparse: ([\w\/]+)", data) - for source in sources_list: sources[source].update({'to_preparse' : 1}) - - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source ([\w\/]+) cleaned up", data) - for source in sources_list: sources[source].update({'cleaned_up' : 1}) - - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source ([\w\/]+) preparsed", data) - for source in sources_list: sources[source].update({'preparsed' : 1}) - - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file already exists, skipping download \((.*)\)", data) - for source in sources_list: sources[source[0]].update({'skipped' : os.path.dirname(source[1])}) - - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file downloaded via (HTTP|FTP): (.*)", data) - for source in sources_list: sources[source[0]].update({'downloaded' : source[1]+"|"+os.path.dirname(source[2])}) - - sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file copied from local FTP: (.*)", data) - for source in sources_list: sources[source[0]].update({'copied' : os.path.dirname(source[1])}) - - skipped_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| (\w+) skipped species = (\d+)", data) - skipped_species = {source[0]: source[1] for source in skipped_species_list} - - added_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| (\w+) species files created = (\d+)", data) - added_species = {source[0]: source[1] for source in added_species_list} - - # Include source statistics - email_message += '
--Source Statistics--
' - for source_name,source_values in sources.items(): - email_message += f'{source_name}:
' - if source_values.get('to_download'): email_message += '   Scheduled for download ✔
' - - if source_values.get('downloaded'): - (download_type, file_path) = source_values['downloaded'].split("|") - email_message += f'   File downloaded via {download_type} into {file_path}
' - elif source_values.get('copied'): email_message += '   File(s) copied from local FTP into %s
' % (source_values['copied']) - elif source_values.get('skipped'): email_message += '   File(s) download skipped, already exists in %s
' % (source_values['skipped']) - - if source_values.get('to_cleanup'): email_message += '   Scheduled for cleanup ✔
' - if source_values.get('cleaned_up'): email_message += '   Cleaned up ✔
' - - if source_values.get('to_preparse'): email_message += '   Scheduled for pre-parse ✔
' - if source_values.get('preparsed'): email_message += '   Pre-parsed ✔
' - - # Include species statistics - email_message += '
--Species Statistics--
' - email_message += 'Skipped Species (files already exist):
' - for source_name, count in skipped_species.items(): - email_message += f'   {source_name}: {count}
' - email_message += 'Added Species (files created):
' - for source_name, count in added_species.items(): - email_message += f'   {source_name}: {count}
' - - email_message += '
To run the Xref Process Pipeline based on the data from this pipeline, use the same --base_path, --source_db_url, and --central_db_url (if preparse was run) values provided to this pipeline.' - - # Send email - message = EmailMessage() - message['Subject'] = f'{pipeline_name} Finished' - message['From'] = email_address - message['To'] = email_address - message.set_content(email_message, 'html') - - smtp = SMTP(email_server) - smtp.send_message(message) - + with open(main_log_file, "a") as out_fh: + for pattern in log_order: + pattern = r"^tmp_logfile_" + pattern + r"_\d+" + matches = [s for s in log_files if re.search(pattern, s)] + + for log_file in matches: + log_file = os.path.join(log_path, log_file) + with open(log_file) as in_fh: + log_data = in_fh.read() + out_fh.write(log_data) + os.remove(log_file) + + return main_log_file diff --git a/src/python/ensembl/production/xrefs/Mapping.py b/src/python/ensembl/production/xrefs/Mapping.py new file mode 100644 index 000000000..838470c1a --- /dev/null +++ b/src/python/ensembl/production/xrefs/Mapping.py @@ -0,0 +1,91 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapping module to map the added xrefs into the core DB.""" + +from ensembl.production.xrefs.Base import * +from ensembl.production.xrefs.mappers.ProcessPriorities import ProcessPriorities +from ensembl.production.xrefs.mappers.ProcessPaired import ProcessPaired +from ensembl.production.xrefs.mappers.ProcessMoves import ProcessMoves +from ensembl.production.xrefs.mappers.OfficialNaming import OfficialNaming +from ensembl.production.xrefs.mappers.TestMappings import TestMappings +from ensembl.production.xrefs.mappers.XrefLoader import XrefLoader +from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs + + +class Mapping(Base): + def run(self): + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + registry = self.param("registry_url", None, {"type": "str"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + verbose = self.param("verbose", None, {"default": False}) + + logging.info(f"Mapping starting for species '{species_name}'") + + if not core_db_url: + core_db_url = self.get_db_from_registry( + species_name, "core", release, registry + ) + + # Get species id + db_engine = self.get_db_engine(core_db_url) + with db_engine.connect() as core_dbi: + species_id = self.get_taxon_id(core_dbi) + + # Get the appropriate mapper + mapper = self.get_xref_mapper( + xref_db_url, species_name, base_path, release, core_db_url, registry + ) + + # Process the xref priorities + priorities = ProcessPriorities(mapper) + priorities.process() + + # Process the paired xrefs + paired = ProcessPaired(mapper) + paired.process() + + # Process the needed xref moves 
+ mover = ProcessMoves(mapper) + mover.biomart_testing(verbose) + mover.source_defined_move(verbose) + mover.process_alt_alleles(verbose) + + # Set the official names for select species + naming = OfficialNaming(mapper) + naming.run(species_id, verbose) + + # Test the validity of the data before mapping into the core DB + warnings = 0 + logging.info("Testing mappings") + tester = TestMappings(mapper) + warnings += tester.direct_stable_id_check() + warnings += tester.xrefs_counts_check() + warnings += tester.name_change_check(mapper.official_name()) + + # Map xref data onto the core DB + loader = XrefLoader(mapper) + loader.update(species_name) + + # Set the display xrefs + display = DisplayXrefs(mapper) + display.build_display_xrefs() + + # Pass datachecks data + dataflow_params = {"species_name": species_name, "species_db": core_db_url} + + self.write_output("datacheck", dataflow_params) diff --git a/src/python/ensembl/production/xrefs/ParseSource.py b/src/python/ensembl/production/xrefs/ParseSource.py new file mode 100644 index 000000000..d3024fe20 --- /dev/null +++ b/src/python/ensembl/production/xrefs/ParseSource.py @@ -0,0 +1,90 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
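The warnings total accumulated by TestMappings in Mapping.run() above is not otherwise surfaced; a minimal sketch of reporting it, assuming a logged summary is the desired behaviour (an assumption, not something the patch does), reusing the warnings and species_name variables from that method:

    # Sketch: summarise sanity-check warnings before the xrefs are loaded into the core DB.
    if warnings:
        logging.warning(f"{warnings} mapping sanity-check warnings for species '{species_name}'")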
+ +"""Parsing module to call specific file/db parsers based on xref source.""" + +from ensembl.production.xrefs.Base import * + + +class ParseSource(Base): + def run(self): + parser_name = self.param_required("parser", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + species_id = self.param_required("species_id", {"type": "int"}) + file_name = self.param_required("file_name", {"type": "str"}) + source_id = self.param_required("source_id", {"type": "int"}) + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + registry = self.param_required("registry_url", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + core_db_url = self.param_required("core_db_url", {"type": "str"}) + db = self.param("db", None, {"type": "str"}) + release_file = self.param("release_file", None, {"type": "str"}) + source_name = self.param("source_name", None, {"type": "str"}) + + logging.info( + f"ParseSource starting for source '{source_name}' with parser '{parser_name}' for species '{species_name}'" + ) + + failure = 0 + message = None + + # Set parser arguments + args = { + "source_id": source_id, + "species_id": species_id, + "rel_file": release_file, + "species_name": species_name, + "file": file_name, + } + + # Connect to xref db + xref_dbi = self.get_dbi(xref_db_url) + args["xref_dbi"] = xref_dbi + + # Get the extra db, if any + if db: + dba = self.param(f"{db}_db_url") + if not dba: + dba = self.get_db_from_registry(species_name, db, release, registry) + + args["dba"] = dba + args["ensembl_release"] = release + args["core_db_url"] = core_db_url + + # For RefSeqCoordinate source, we run a perl script + if parser_name == "RefSeqCoordinateParser": + args["perl_scripts_dir"] = self.param_required("perl_scripts_dir") + args["xref_db_url"] = xref_db_url + + # For UniProt we need the hgnc file to extract descriptions + if re.search(r"^UniProt", parser_name): + args['hgnc_file'] = self.param("hgnc_file", None, {"type": "str"}) + + # Import the parser + module_name = f"ensembl.production.xrefs.parsers.{parser_name}" + module = importlib.import_module(module_name) + parser_class = getattr(module, parser_name) + parser = parser_class() + + (errors, message) = parser.run(args) + failure += errors + + xref_dbi.close() + + if failure: + raise Exception(f"Parser '{parser_name}' failed with message: {message}") + + logging.info( + f"Source '{source_name}' parsed for species '{species_name}' with the following message:\n{message}" + ) diff --git a/src/python/ensembl/production/xrefs/ProcessAlignment.py b/src/python/ensembl/production/xrefs/ProcessAlignment.py new file mode 100644 index 000000000..1f2295d43 --- /dev/null +++ b/src/python/ensembl/production/xrefs/ProcessAlignment.py @@ -0,0 +1,37 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Xref module to process the sequence matched allignments.""" + +from ensembl.production.xrefs.Base import * +from ensembl.production.xrefs.mappers.ProcessMappings import ProcessMappings + + +class ProcessAlignment(Base): + def run(self): + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + registry = self.param("registry_url", None, {"type": "str"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + + logging.info(f"ProcessAlignment starting for species '{species_name}'") + + # Get the appropriate mapper + mapper = self.get_xref_mapper( + xref_db_url, species_name, base_path, release, core_db_url, registry + ) + + # Process the alignments + mappings = ProcessMappings(mapper) + mappings.process_mappings() diff --git a/src/python/ensembl/production/xrefs/RNACentralMapping.py b/src/python/ensembl/production/xrefs/RNACentralMapping.py new file mode 100644 index 000000000..e71353f50 --- /dev/null +++ b/src/python/ensembl/production/xrefs/RNACentralMapping.py @@ -0,0 +1,62 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Xref module to process the RNAcentral mappings.""" + +from ensembl.production.xrefs.Base import * +from ensembl.production.xrefs.mappers.RNACentralMapper import RNACentralMapper +from ensembl.production.xrefs.mappers.methods.MySQLChecksum import MySQLChecksum + + +class RNACentralMapping(Base): + def run(self): + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + source_db_url = self.param_required("source_db_url", {"type": "str"}) + registry = self.param("registry_url", None, {"type": "str"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + + logging.info(f"RNACentralMapping starting for species '{species_name}'") + + if not core_db_url: + core_db_url = self.get_db_from_registry( + species_name, "core", release, registry + ) + + # Get species id + db_engine = self.get_db_engine(core_db_url) + with db_engine.connect() as core_dbi: + species_id = self.get_taxon_id(core_dbi) + + # Get the rna central mapper + mapper = RNACentralMapper( + self.get_xref_mapper( + xref_db_url, species_name, base_path, release, core_db_url, registry + ) + ) + + # Get source id + db_engine = self.get_db_engine(source_db_url) + with db_engine.connect() as source_dbi: + source_id = self.get_source_id_from_name(source_dbi, "RNACentral") + + method = MySQLChecksum({"MAPPER": mapper}) + results = method.run( + mapper.target(), source_id, mapper.object_type(), source_dbi + ) + + if results: + mapper.upload(results, species_id) diff --git a/src/python/ensembl/production/xrefs/ScheduleAlignment.py b/src/python/ensembl/production/xrefs/ScheduleAlignment.py new file mode 100644 index 000000000..d1fca7697 --- /dev/null +++ b/src/python/ensembl/production/xrefs/ScheduleAlignment.py @@ -0,0 +1,73 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Scheduling module to create xref/ensEMBL alignment jobs.""" + +from ensembl.production.xrefs.Base import * + + +class ScheduleAlignment(Base): + def run(self): + species_name = self.param_required("species_name", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + target_file = self.param_required("ensembl_fasta", {"type": "str"}) + source_file = self.param_required("xref_fasta", {"type": "str"}) + seq_type = self.param_required("seq_type", {"type": "str"}) + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + method = self.param_required("method", {"type": "str"}) + query_cutoff = self.param_required("query_cutoff", {"type": "int"}) + target_cutoff = self.param_required("target_cutoff", {"type": "int"}) + source_id = self.param_required("source_id", {"type": "int"}) + source_name = self.param_required("source_name", {"type": "str"}) + job_index = self.param_required("job_index", {"type": "int"}) + + logging.info( + f"ScheduleAlignment starting for species '{species_name}' with seq_type '{seq_type}' and job_index '{job_index}'" + ) + + # Inspect file size to decide on chunking + size = os.stat(target_file).st_size + chunks = int(size / 1000000) + 1 + + # Create output path + output_path = self.get_path(base_path, species_name, release, "alignment") + + # Pass alignment data for each chunk + chunklet = 1 + while chunklet <= chunks: + output_path_chunk = os.path.join( + output_path, + f"{seq_type}_alignment_{source_id}_{chunklet}_of_{chunks}.map", + ) + self.write_output( + "alignment", + { + "species_name": species_name, + "align_method": method, + "query_cutoff": query_cutoff, + "target_cutoff": target_cutoff, + "max_chunks": chunks, + "chunk": chunklet, + "job_index": job_index, + "source_file": source_file, + "target_file": target_file, + "xref_db_url": xref_db_url, + "map_file": output_path_chunk, + "source_id": source_id, + "source_name": source_name, + "seq_type": seq_type, + }, + ) + chunklet += 1 diff --git a/src/python/ensembl/production/xrefs/ScheduleCleanup.py b/src/python/ensembl/production/xrefs/ScheduleCleanup.py index 58396b33a..eeddf94e1 100644 --- a/src/python/ensembl/production/xrefs/ScheduleCleanup.py +++ b/src/python/ensembl/production/xrefs/ScheduleCleanup.py @@ -16,42 +16,48 @@ from ensembl.production.xrefs.Base import * -class ScheduleCleanup(Base): - def run(self): - base_path = self.param_required('base_path') - source_db_url = self.param_required('source_db_url') - clean_files = self.param('clean_files') - clean_dir = self.param('clean_dir') - split_files_by_species = self.param('split_files_by_species') - - logging.info('ScheduleCleanup starting with parameters:') - logging.info(f'Param: base_path = {base_path}') - logging.info(f'Param: source_db_url = {source_db_url}') - logging.info(f'Param: clean_files = {clean_files}') - logging.info(f'Param: clean_dir = {clean_dir}') - logging.info(f'Param: split_files_by_species = {split_files_by_species}') - - # Connect to source db - db_engine = self.get_db_engine(source_db_url) - with db_engine.connect() as dbi: - # Get name and version file for each source - query = select(SourceSORM.name, VersionORM.revision).where(SourceSORM.source_id==VersionORM.source_id).distinct() - sources = dbi.execute(query).mappings().all() - - for source in sources: - # Only cleaning RefSeq and UniProt for now - if not (re.search(r"^RefSeq_(dna|peptide)", source.name) or re.search(r"^Uniprot", source.name)): continue - - # Remove / 
char from source name to access directory - clean_name = source.name - clean_name = re.sub(r"\/", "", clean_name) - - # Send parameters into cleanup jobs for each source - if os.path.exists(os.path.join(base_path, clean_name)): - logging.info(f'Source to cleanup: {source.name}') - - self.write_output('cleanup_sources', { - 'name' : source.name, - 'version_file' : source.revision - }) +class ScheduleCleanup(Base): + def run(self): + base_path = self.param_required("base_path", {"type": "str"}) + source_db_url = self.param_required("source_db_url", {"type": "str"}) + clean_files = self.param("clean_files", None, {"type": "bool"}) + clean_dir = self.param("clean_dir", None, {"type": "str"}) + split_files_by_species = self.param("split_files_by_species", None, {"type": "bool"}) + + logging.info("ScheduleCleanup starting with parameters:") + logging.info(f"Param: base_path = {base_path}") + logging.info(f"Param: source_db_url = {source_db_url}") + logging.info(f"Param: clean_files = {clean_files}") + logging.info(f"Param: clean_dir = {clean_dir}") + logging.info(f"Param: split_files_by_species = {split_files_by_species}") + + # Connect to source db + db_engine = self.get_db_engine(source_db_url) + with db_engine.connect() as dbi: + # Get name and version file for each source + query = select(SourceSORM.name.distinct(), VersionORM.revision).where( + SourceSORM.source_id == VersionORM.source_id + ) + sources = dbi.execute(query).mappings().all() + + for source in sources: + # Only cleaning RefSeq and UniProt for now + if not ( + re.search(r"^RefSeq_(dna|peptide)", source.name) + or re.search(r"^Uniprot", source.name) + ): + continue + + # Remove / char from source name to access directory + clean_name = source.name + clean_name = re.sub(r"\/", "", clean_name) + + # Send parameters into cleanup jobs for each source + if os.path.exists(os.path.join(base_path, clean_name)): + logging.info(f"Source to cleanup: {source.name}") + + self.write_output( + "cleanup_sources", + {"name": source.name, "version_file": source.revision}, + ) diff --git a/src/python/ensembl/production/xrefs/ScheduleDownload.py b/src/python/ensembl/production/xrefs/ScheduleDownload.py index 8001bccc8..f9af93454 100644 --- a/src/python/ensembl/production/xrefs/ScheduleDownload.py +++ b/src/python/ensembl/production/xrefs/ScheduleDownload.py @@ -16,58 +16,54 @@ from ensembl.production.xrefs.Base import * -class ScheduleDownload(Base): - def run(self): - config_file = self.param_required('config_file') - source_db_url = self.param_required('source_db_url') - reuse_db = self.param_required('reuse_db', {'type': 'bool'}) - skip_preparse = self.param('skip_preparse', None, {'type': 'bool', 'default' : False}) - - logging.info('ScheduleDownload starting with parameters:') - logging.info(f'Param: config_file = {config_file}') - logging.info(f'Param: source_db_url = {source_db_url}') - logging.info(f'Param: reuse_db = {reuse_db}') - logging.info(f'Param: skip_preparse = {skip_preparse}') - # Create the source db from url - self.create_source_db(source_db_url, reuse_db) +class ScheduleDownload(Base): + def run(self): + config_file = self.param_required("config_file", {"type": "str"}) + source_db_url = self.param_required("source_db_url", {"type": "str"}) + reuse_db = self.param_required("reuse_db", {"type": "bool"}) - # Extract sources to download from config file - sources = [] - with open(config_file) as conf_file: - sources = json.load(conf_file) + logging.info("ScheduleDownload starting with parameters:") + logging.info(f"Param: 
config_file = {config_file}") + logging.info(f"Param: source_db_url = {source_db_url}") + logging.info(f"Param: reuse_db = {reuse_db}") - if len(sources) < 1: - raise IOError(f'No sources found in config file {config_file}. Need sources to run pipeline') + # Create the source db from url + self.create_source_db(source_db_url, reuse_db) - for source_data in sources: - name = source_data['name'] - parser = source_data['parser'] - priority = source_data['priority'] - file = source_data['file'] - db = source_data.get('db') - version_file = source_data.get('release') - preparse = source_data.get('preparse') - rel_number = source_data.get('release_number') - catalog = source_data.get('catalog') + # Extract sources to download from config file + sources = [] + with open(config_file) as conf_file: + sources = json.load(conf_file) - logging.info(f'Source to download: {name}') + if len(sources) < 1: + raise IOError( + f"No sources found in config file {config_file}. Need sources to run pipeline" + ) - # Revert to the old parser if not pre-parsing - if preparse and skip_preparse: - parser = source_data['old_parser'] - preparse = 0 + for source_data in sources: + name = source_data["name"] + parser = source_data["parser"] + priority = source_data["priority"] + file = source_data["file"] + db = source_data.get("db") + version_file = source_data.get("release") + rel_number = source_data.get("release_number") + catalog = source_data.get("catalog") - # Pass the source parameters into download jobs - self.write_output('sources', { - 'parser' : parser, - 'name' : name, - 'priority' : priority, - 'db' : db, - 'version_file' : version_file, - 'preparse' : preparse, - 'file' : file, - 'rel_number' : rel_number, - 'catalog' : catalog - }) + logging.info(f"Source to download: {name}") + # Pass the source parameters into download jobs + self.write_output( + "sources", + { + "parser": parser, + "name": name, + "priority": priority, + "db": db, + "version_file": version_file, + "file": file, + "rel_number": rel_number, + "catalog": catalog, + }, + ) diff --git a/src/python/ensembl/production/xrefs/ScheduleMapping.py b/src/python/ensembl/production/xrefs/ScheduleMapping.py new file mode 100644 index 000000000..44032ad76 --- /dev/null +++ b/src/python/ensembl/production/xrefs/ScheduleMapping.py @@ -0,0 +1,56 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
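For reference, ScheduleDownload.run() above expects --config_file to be a JSON array of source entries; a minimal entry, shown here as a Python literal with values taken from xref_all_sources.json further down, looks like this. The db, release, release_number and catalog keys are optional and are read with .get() in the same loop.

    # One entry of the sources config consumed by ScheduleDownload (illustrative):
    {
        "name": "RGD",
        "parser": "RGDParser",
        "file": "https://download.rgd.mcw.edu/pub/data_release/GENES.RAT.txt",
        "priority": 2,
    }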
+ +"""Scheduling module to xref mapping jobs.""" + +from ensembl.production.xrefs.Base import * +from ensembl.production.xrefs.mappers.CoreInfo import CoreInfo + + +class ScheduleMapping(Base): + def run(self): + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + registry = self.param("registry_url", None, {"type": "str"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + + logging.info(f"ScheduleMapping starting for species '{species_name}'") + + # Get the appropriate mapper + mapper = self.get_xref_mapper( + xref_db_url, species_name, base_path, release, core_db_url, registry + ) + + # Load the core data + logging.info("Loading core data") + core_info = CoreInfo(mapper) + core_info.get_core_data() + core_info.get_alt_alleles() + + if not core_db_url: + core_db_url = self.get_db_from_registry( + species_name, "core", release, registry + ) + + # Pass mapping data + dataflow_params = { + "xref_db_url": xref_db_url, + "species_name": species_name, + "species_db": core_db_url, + } + + self.write_output("pre_mapping", dataflow_params) + self.write_output("mapping", dataflow_params) diff --git a/src/python/ensembl/production/xrefs/ScheduleParse.py b/src/python/ensembl/production/xrefs/ScheduleParse.py new file mode 100644 index 000000000..cf044e1ee --- /dev/null +++ b/src/python/ensembl/production/xrefs/ScheduleParse.py @@ -0,0 +1,219 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Scheduling module to create parsing jobs for each xref source.""" + +import glob + +from ensembl.production.xrefs.Base import * + + +class ScheduleParse(Base): + def run(self): + species_name = self.param_required("species_name", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + registry = self.param_required("registry_url", {"type": "str"}) + order_priority = self.param_required("priority", {"type": "int"}) + source_db_url = self.param_required("source_db_url", {"type": "str"}) + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + get_species_file = self.param_required("get_species_file", {"type": "bool"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + + logging.info(f"ScheduleParse starting for species '{species_name}'") + logging.info(f"\tParam: order_priority = {order_priority}") + logging.info(f"\tParam: source_db_url = {source_db_url}") + logging.info(f"\tParam: xref_db_url = {xref_db_url}") + logging.info(f"\tParams: core_db_url = {core_db_url}") + + dataflow_suffix, dataflow_sub_suffix = "", "" + + # Create Xref database only at priority 1 (one time) + if order_priority == 1: + sources_config_file = self.param_required("sources_config_file") + logging.info(f"\tParam: sources_config_file = {sources_config_file}") + + # Construct xref update url + xref_db_url = make_url(xref_db_url) + xref_db_url = xref_db_url.set( + database=f"{species_name}_xref_update_{release}" + ) + self.create_xref_db(xref_db_url, sources_config_file) + xref_db_url = xref_db_url.render_as_string(hide_password=False) + + dataflow_suffix = "primary_sources" + dataflow_sub_suffix = "schedule_secondary" + elif order_priority == 2: + dataflow_suffix = "secondary_sources" + dataflow_sub_suffix = "schedule_tertiary" + elif order_priority == 3: + dataflow_suffix = "tertiary_sources" + dataflow_sub_suffix = "dump_ensembl" + else: + raise AttributeError("Parameter 'priority' can only be of value 1, 2, or 3") + + # Get core db from registry if not provided + if not core_db_url: + core_db_url = self.get_db_from_registry( + species_name, "core", release, registry + ) + if not re.search(r"^mysql://", core_db_url): + core_db_url = "mysql://" + core_db_url + + # Get species and division ids + db_engine = self.get_db_engine(core_db_url) + with db_engine.connect() as core_dbi: + species_id = self.get_taxon_id(core_dbi) + division_id = self.get_division_id(core_dbi) + + # Retrieve list of sources from source database + db_engine = self.get_db_engine(source_db_url) + with db_engine.connect() as source_dbi: + query = ( + select( + SourceSORM.name.distinct(), + SourceSORM.parser, + VersionORM.file_path, + VersionORM.clean_path, + VersionORM.db, + VersionORM.priority, + VersionORM.revision, + ) + .where(SourceSORM.source_id == VersionORM.source_id) + .order_by(SourceSORM.name) + ) + sources = source_dbi.execute(query).mappings().all() + + # Connect to the xref intermediate db + xref_dbi = self.get_dbi(xref_db_url) + + hgnc_path = None + total_sources = 0 + + for source in sources: + if source.name == "HGNC": + hgnc_path = source.file_path + + if source.db == "checksum": + continue + if source.priority != order_priority: + continue + + dataflow_params = { + "species_name": species_name, + "species_id": species_id, + "core_db_url": core_db_url, + "xref_db_url": xref_db_url, + } + + # Use clean files if available + file_name = source.file_path + if source.clean_path: + file_name = source.clean_path + + # Some sources are species-specific + source_id = 
self.get_source_id( + xref_dbi, source.parser, species_id, source.name, division_id + ) + if not source_id: + continue + + dataflow_params["source_id"] = source_id + dataflow_params["source_name"] = source.name + dataflow_params["parser"] = source.parser + if source.revision: + dataflow_params["release_file"] = source.revision + + # Some sources need a connection to a special database + if source.db: + dataflow_params["db"] = source.db + + if source.db != "core": + db_url = self.get_db_from_registry( + species_name, source.db, release, registry + ) + if not db_url: + # Not all species have an otherfeatures database + if source.db == "otherfeatures": + continue + else: + raise LookupError( + f"Cannot use {source.parser} for {species_name}, no {source.db} database" + ) + else: + dataflow_params[f"{source.db}_db_url"] = db_url + + logging.info( + f"Parser '{source.parser}' for source '{source.name}' scheduled for species '{species_name}'" + ) + + if file_name == "Database": + dataflow_params["file_name"] = file_name + self.write_output(dataflow_suffix, dataflow_params) + total_sources += 1 + else: + # Get list of files if directory + if os.path.isdir(file_name): + list_files = os.listdir(file_name) + list_files = [os.path.join(file_name, f) for f in list_files] + else: + list_files = [file_name] + + # For Uniprot and Refseq, files might have been split by species + if get_species_file: + match source.name: + case "Uniprot/SWISSPROT": + file_prefix = "uniprot_sprot" + case "Uniprot/SPTREMBL": + file_prefix = "uniprot_trembl" + case "RefSeq_dna": + file_prefix = "refseq_rna" + case "RefSeq_peptide": + file_prefix = "refseq_protein" + case _: + file_prefix = None + + if file_prefix: + list_files = glob.glob( + file_name + "/**/" + file_prefix + "-" + str(species_id), + recursive=True, + ) + + if source.name == "ZFIN_ID": + list_files = [list_files[0]] + + for file in list_files: + if source.revision and file == source.revision: + continue + + dataflow_params["file_name"] = file + + if re.search(r"^Uniprot", source.name): + hgnc_files = glob.glob(hgnc_path + "/*") + dataflow_params["hgnc_file"] = hgnc_files[0] + + self.write_output(dataflow_suffix, dataflow_params) + total_sources += 1 + + xref_dbi.close() + + if total_sources == 0: + with open(f"dataflow_{dataflow_suffix}.json", "a") as fh: + fh.write("") + + dataflow_params = { + "species_name": species_name, + "species_db": core_db_url, + "xref_db_url": xref_db_url, + } + self.write_output(dataflow_sub_suffix, dataflow_params) diff --git a/src/python/ensembl/production/xrefs/ScheduleSpecies.py b/src/python/ensembl/production/xrefs/ScheduleSpecies.py new file mode 100644 index 000000000..e63de241a --- /dev/null +++ b/src/python/ensembl/production/xrefs/ScheduleSpecies.py @@ -0,0 +1,178 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
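A worked example of the species-split file lookup near the end of ScheduleParse.run() above, using hypothetical paths: for source 'Uniprot/SWISSPROT' the prefix is 'uniprot_sprot', so for species_id 9606 the recursive glob narrows the list to the per-species chunk (file_name here is the source's file_path directory from the code above).

    import glob

    # file_prefix + "-" + str(species_id), as in the code above:
    list_files = glob.glob(file_name + "/**/uniprot_sprot-9606", recursive=True)
    # e.g. ["<base_path>/Uniprot/SWISSPROT/split/uniprot_sprot-9606"]  (hypothetical path)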
+ +"""Scheduling module to create a pipeline branch for each species in list or division.""" + +from ensembl.production.xrefs.Base import * + + +class ScheduleSpecies(Base): + def run(self): + run_all = self.param_required("run_all", {"type": "bool"}) + registry = self.param_required("registry_url", {"type": "str"}) + ensembl_release = self.param_required("release", {"type": "int"}) + metasearch_url = self.param_required("metasearch_url", {"type": "str"}) + species = self.param("species", None, {"default": "", "type": "str"}) + antispecies = self.param("antispecies", None, {"default": "", "type": "str"}) + division = self.param("division", None, {"default": "", "type": "str"}) + db_prefix = self.param("db_prefix", None, {"type": "str"}) + group = self.param("group", None, {"default": "core", "type": "str"}) + + logging.info("ScheduleSpecies starting with parameters:") + logging.info(f"\tParam: run_all = {run_all}") + logging.info(f"\tParam: registry = {registry}") + logging.info(f"\tParam: release = {ensembl_release}") + logging.info(f"\tParam: metasearch_url = {metasearch_url}") + logging.info(f"\tParam: species = {species}") + logging.info(f"\tParam: antispecies = {antispecies}") + logging.info(f"\tParam: division = {division}") + logging.info(f"\tParam: db_prefix = {db_prefix}") + logging.info(f"\tParam: group = {group}") + + if species: + species = species.split(",") + if antispecies: + antispecies = antispecies.split(",") + if division: + division = division.split(",") + ensembl_release = str(ensembl_release) + + # Fix registry url, if needed + match = re.search(r"^(.*)://(.*)", registry) + if match: + registry = match.group(2) + match = re.search(r"(.*)/(.*)", registry) + if match: + registry = match.group(1) + + loaded_dbs = {} + dbs = [] + + # Construct the db name pattern + name_pattern = f"%_{group}%" + if db_prefix: + db_prefix = f"{db_prefix}_" + else: + db_prefix = "" + name_pattern = f"{db_prefix}{name_pattern}" + + # Getting all dbs + if run_all: + metasearch_body = { + "name_pattern": name_pattern, + "filters": [ + {"meta_key": "schema_version", "meta_value": ensembl_release}, + ], + "servers": [registry], + } + + # Query registry for all core dbs + dbs = requests.post(metasearch_url, json=metasearch_body).json() + dbs = dbs[registry] + + loaded_dbs = self.check_validity(dbs, db_prefix, group, ensembl_release) + + # Getting dbs for specified species + elif species and len(species) > 0: + for species_name in species: + name_pattern = f"{species_name}_core%" + name_pattern = f"{db_prefix}{name_pattern}" + + metasearch_body = { + "name_pattern": name_pattern, + "filters": [ + {"meta_key": "schema_version", "meta_value": ensembl_release}, + ], + "servers": [registry], + } + + # Query registry for species dbs + species_dbs = requests.post(metasearch_url, json=metasearch_body).json() + + if len(species_dbs[registry]) < 1: + raise IOError( + f"Database not found for {species_name}, check registry parameters" + ) + else: + dbs = dbs + species_dbs[registry] + + loaded_dbs = self.check_validity(dbs, db_prefix, group, ensembl_release) + + # Check if all wanted species were found + for species_name in species: + if not loaded_dbs.get(species_name): + raise IOError( + f"Database not found for {species_name}, check registry parameters" + ) + + # Getting dbs for specified divisions + elif division and len(division) > 0: + for div in division: + metasearch_body = { + "name_pattern": name_pattern, + "filters": [ + {"meta_key": "schema_version", "meta_value": ensembl_release}, + {"meta_key": 
"species.division", "meta_value": div}, + ], + "servers": [registry], + } + + # Query registry for dbs in division + div_dbs = requests.post(metasearch_url, json=metasearch_body).json() + dbs = dbs + div_dbs[registry] + + loaded_dbs = self.check_validity(dbs, db_prefix, group, ensembl_release) + + if len(loaded_dbs) == 0: + raise IOError(f"Could not find any matching dbs in registry {registry}") + + if run_all: + logging.info(f"All species in {len(loaded_dbs)} databases loaded") + + # Write dataflow output + for species_name, db in loaded_dbs.items(): + if species_name not in antispecies: + self.write_output( + "species", {"species_name": species_name, "species_db": db} + ) + + def check_validity(self, dbs: List(str), prefix: str, group: str, release: str): + valid_dbs = {} + + for db in dbs: + # Extract db name + db_name = db + match = re.search(r"(.*)/(.*)", db_name) + if match: + db_name = match.group(2) + + # Check if db is valid + match = re.search( + r"^(%s)([a-z]+_[a-z0-9]+(?:_[a-z0-9]+)?)_%s(?:_\d+)?_%s_(\w+)$" + % (prefix, group, release), + db_name, + ) + if match: + species_name = match.group(2) + if not valid_dbs.get(species_name): + logging.info(f"Species {species_name} loaded") + valid_dbs[species_name] = db + else: + raise IOError( + f"Database {valid_dbs[species_name]} already loaded for species {species_name}, cannot load second database {db}" + ) + else: + logging.info(f"Could not extract species name from database {db}") + + return valid_dbs diff --git a/src/python/ensembl/production/xrefs/UniParcMapping.py b/src/python/ensembl/production/xrefs/UniParcMapping.py new file mode 100644 index 000000000..86668b621 --- /dev/null +++ b/src/python/ensembl/production/xrefs/UniParcMapping.py @@ -0,0 +1,62 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Xref module to process the Uniparc mappings.""" + +from ensembl.production.xrefs.Base import * +from ensembl.production.xrefs.mappers.UniParcMapper import UniParcMapper +from ensembl.production.xrefs.mappers.methods.MySQLChecksum import MySQLChecksum + + +class UniParcMapping(Base): + def run(self): + xref_db_url = self.param_required("xref_db_url", {"type": "str"}) + species_name = self.param_required("species_name", {"type": "str"}) + base_path = self.param_required("base_path", {"type": "str"}) + release = self.param_required("release", {"type": "int"}) + source_db_url = self.param_required("source_db_url", {"type": "str"}) + registry = self.param("registry_url", None, {"type": "str"}) + core_db_url = self.param("species_db", None, {"type": "str"}) + + logging.info(f"UniParcMapping starting for species '{species_name}'") + + if not core_db_url: + core_db_url = self.get_db_from_registry( + species_name, "core", release, registry + ) + + # Get species id + db_engine = self.get_db_engine(core_db_url) + with db_engine.connect() as core_dbi: + species_id = self.get_taxon_id(core_dbi) + + # Get the uniparc mapper + mapper = UniParcMapper( + self.get_xref_mapper( + xref_db_url, species_name, base_path, release, core_db_url, registry + ) + ) + + # Get source id + db_engine = self.get_db_engine(source_db_url) + with db_engine.connect() as source_dbi: + source_id = self.get_source_id_from_name(source_dbi, "UniParc") + + method = MySQLChecksum({"MAPPER": mapper}) + results = method.run( + mapper.target(), source_id, mapper.object_type(), source_dbi + ) + + if results: + mapper.upload(results, species_id) diff --git a/src/python/ensembl/production/xrefs/__init__.py b/src/python/ensembl/production/xrefs/__init__.py new file mode 100644 index 000000000..8dd00df34 --- /dev/null +++ b/src/python/ensembl/production/xrefs/__init__.py @@ -0,0 +1,15 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Xref pipeline modules.""" diff --git a/src/python/ensembl/production/xrefs/config/xref_all_sources.json b/src/python/ensembl/production/xrefs/config/xref_all_sources.json index e7b0065a4..929450ebf 100644 --- a/src/python/ensembl/production/xrefs/config/xref_all_sources.json +++ b/src/python/ensembl/production/xrefs/config/xref_all_sources.json @@ -6,13 +6,6 @@ "db" : "core", "priority" : 1 }, - { - "name" : "CCDS", - "parser" : "CCDSParser", - "file" : "Database", - "db" : "ccds", - "priority" : 1 - }, { "name" : "UniParc", "parser" : "ChecksumParser", @@ -88,10 +81,17 @@ "db" : "core", "priority" : 1 }, + { + "name" : "RFAM", + "parser" : "CoreXrefParser", + "file" : "script:logic_name=>rfam_12.2_gene,object_type=>gene", + "db" : "core", + "priority" : 1 + } { "name" : "RGD", "parser" : "RGDParser", - "file" : "https://download.rgd.mcw.edu/pub/data_release/GENES_RAT.txt", + "file" : "https://download.rgd.mcw.edu/pub/data_release/GENES.RAT.txt", "priority" : 2 }, { @@ -110,28 +110,24 @@ }, { "name" : "RefSeq_dna", - "parser" : "RefSeqDatabaseParser", - "old_parser" : "RefSeqGPFFParser", + "parser" : "RefSeqGPFFParser", "file" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/complete.*rna.gbff.gz", "method" : "--bestn 5", "query_cutoff" : 90, "target_cutoff" : 90, "release" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-notes/RefSeq-release*.txt", - "preparse" : 1, "priority" : 2, "release_number" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER", "catalog" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/release*.files.installed" }, { "name" : "RefSeq_peptide", - "parser" : "RefSeqDatabaseParser", - "old_parser" : "RefSeqGPFFParser", + "parser" : "RefSeqGPFFParser", "file" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/complete.*.protein.gpff.gz", "method" : "--bestn 1", "query_cutoff" : 100, "target_cutoff" : 100, "release" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-notes/RefSeq-release*.txt", - "preparse" : 1, "priority" : 3, "release_number" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER", "catalog" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/release*.files.installed" @@ -159,25 +155,21 @@ }, { "name" : "Uniprot/SWISSPROT", - "parser" : "UniProtDatabaseParser", - "old_parser" : "UniProtParser", + "parser" : "UniProtParser", "file" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_sprot.dat.gz", "method" : "--bestn 1", "query_cutoff" : 100, "target_cutoff" : 100, - "preparse" : 1, "release" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/reldate.txt", "priority" : 1 }, { "name" : "Uniprot/SPTREMBL", - "parser" : "UniProtDatabaseParser", - "old_parser" : "UniProtParser", + "parser" : "UniProtParser", "file" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_trembl.dat.gz", "method" : "--bestn 1", "query_cutoff" : 100, "target_cutoff" : 100, - "preparse" : 1, "release" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/reldate.txt", "priority" : 1 }, @@ -187,35 +179,35 @@ "file" : "https://ftp.ebi.ac.uk/pub/databases/genenames/vgnc/tsv/vgnc_gene_set_All.txt.gz", "priority" : 1 }, + { + "name" : "ZFIN_desc", + "parser" : "ZFINDescParser", + "file" : "ftp://zfin.org/pub/transfer/MEOW/zfin_genes.txt", + "priority" : 1 + }, { "name" : "ZFIN_ID", "parser" : "ZFINParser", - "file" : "https://zfin.org/data_transfer/Downloads/refseq.txt", + "file" : "https://zfin.org/data_transfer/Downloads/uniprot.txt", "priority" : 3 }, { "name" : 
"ZFIN_ID", "parser" : "ZFINParser", - "file" : "https://zfin.org/data_transfer/Downloads/uniprot.txt", - "priority" : 2 + "file" : "https://zfin.org/data_transfer/Downloads/refseq.txt", + "priority" : 3 }, { "name" : "ZFIN_ID", "parser" : "ZFINParser", "file" : "https://zfin.org/data_transfer/Downloads/aliases.txt", - "priority" : 2 + "priority" : 3 }, { "name" : "ZFIN_ID", "parser" : "ZFINParser", - "file" : "https://zfin.org/data_transfer/Downloads/gene_seq.txt", - "priority" : 1 - }, - { - "name" : "ZFIN_desc", - "parser" : "ZFINDescParser", - "file" : "ftp://zfin.org/pub/transfer/MEOW/zfin_genes.txt", - "priority" : 1 + "file" : "https://zfin.org/downloads/ensembl_1_to_1.txt", + "priority" : 3 }, { "name" : "cint_jgi_v1", @@ -245,4 +237,4 @@ "db" : "ccds", "priority" : 3 } -] +] \ No newline at end of file diff --git a/src/python/ensembl/production/xrefs/config/xref_config.ini b/src/python/ensembl/production/xrefs/config/xref_config.ini index 5a4830d52..ca3452245 100644 --- a/src/python/ensembl/production/xrefs/config/xref_config.ini +++ b/src/python/ensembl/production/xrefs/config/xref_config.ini @@ -519,8 +519,7 @@ name = RefSeq_dna order = 15 priority = 2 prio_descr = refseq -parser = RefSeqDatabaseParser -old_parser = RefSeqGPFFParser +parser = RefSeqGPFFParser [source RefSeq_dna::gencode] # Used by human and mouse @@ -543,8 +542,7 @@ name = RefSeq_dna order = 15 priority = 2 prio_descr = refseq -parser = RefSeqDatabaseParser -old_parser = RefSeqGPFFParser +parser = RefSeqGPFFParser [source RefSeq_dna::MULTI-complete] # Used by phaeodactylum_tricornutum @@ -728,8 +726,7 @@ name = RefSeq_peptide order = 25 priority = 2 prio_descr = refseq -parser = RefSeqDatabaseParser -old_parser = RefSeqGPFFParser +parser = RefSeqGPFFParser [source SGD_GENE::saccharomyces_cerevisiae] # Used by saccharomyces_cerevisiae @@ -822,8 +819,7 @@ name = Uniprot/SPTREMBL order = 20 priority = 3 prio_descr = sequence_mapped -parser = UniProtDatabaseParser -old_parser = UniProtParser +parser = UniProtParser dependent_on = MIM [source Uniprot/SPTREMBL::gencode] @@ -851,8 +847,7 @@ name = Uniprot/SWISSPROT order = 20 priority = 3 prio_descr = sequence_mapped -parser = UniProtDatabaseParser -old_parser = UniProtParser +parser = UniProtParser dependent_on = MIM [source Uniprot/SWISSPROT::gencode] @@ -937,6 +932,13 @@ order = 20 priority = 1 parser = UniProtParser +[source UniProt::STRING] +# Special source used in UniProtParser. No species uses this source. 
+name = STRING +order = 20 +priority = 1 +parser = UniProtParser + [source UniParc::MULTI] name = UniParc order = 20 @@ -1004,6 +1006,14 @@ parser = XenopusJamboreeParser name = ZFIN_ID order = 31 priority = 1 +prio_descr = direct +parser = ZFINParser + +[source ZFIN_ID::danio_rerio#02] +# Used by danio_rerio +name = ZFIN_ID +order = 31 +priority = 2 prio_descr = uniprot/refseq parser = ZFINParser @@ -1488,8 +1498,6 @@ sources = ZFIN_ID::danio_rerio#01,ZFIN_ID::danio_rerio#03 taxonomy_id = 10116 sources = RGD::rattus_norvegicus,RGD::rattus_norvegicus#02 - - [species ciona_intestinalis] taxonomy_id = 7719 sources = cint_jgi_v1::ciona_intestinalis @@ -1588,7 +1596,7 @@ sources = PomBase::schizosaccharomyces_pombe [species plants] taxonomy_id = 33090 -sources = EntrezGene::MULTI,Reactome::MULTI,RNACentral::MULTI,RefSeq_dna::MULTI-Plants,RefSeq_import::otherfeatures,Uniprot/SPTREMBL::MULTI,Uniprot/SWISSPROT::MULTI,UniParc::MULTI,RFAM::MULTI,miRBase::MULTI,ArrayExpress::MULTI,ncRNA_EG::EG,misc_EG::EG +sources = EntrezGene::MULTI,Reactome::MULTI,RNACentral::MULTI,RefSeq_dna::MULTI-Plants,RefSeq_peptide::MULTI-Plants,RefSeq_import::otherfeatures,Uniprot/SPTREMBL::MULTI,Uniprot/SWISSPROT::MULTI,UniParc::MULTI,RFAM::MULTI,miRBase::MULTI,ArrayExpress::MULTI,ncRNA_EG::EG,misc_EG::EG [species glycine_max] taxonomy_id = 3847 @@ -1677,4 +1685,3 @@ sources = wormbase::tmuris [species protist] taxonomy_id = 2759 sources = EntrezGene::MULTI,RefSeq_dna::MULTI-complete,RefSeq_peptide::MULTI-complete,Uniprot/SPTREMBL::MULTI,Uniprot/SWISSPROT::MULTI,TRNASCAN_SE::MULTI,RNAMMER::MULTI,ArrayExpress::EG,PHIbase::MULTI,miRBase::MULTI,misc_EG::EG,RFAM::EG - diff --git a/src/python/ensembl/production/xrefs/mappers/BasicMapper.py b/src/python/ensembl/production/xrefs/mappers/BasicMapper.py new file mode 100644 index 000000000..362eea354 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/BasicMapper.py @@ -0,0 +1,432 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
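
The xref_config.ini changes above follow the file's usual layout: each [source NAME::scope] section carries name/order/priority/prio_descr/parser keys, and each [species ...] section lists a taxonomy_id plus a comma-separated sources line. The pipeline has its own loader for this file, but a throwaway snippet like the following (path and section names taken from the file, the parsing code itself purely illustrative) is a quick way to sanity-check a section after editing:

    from configparser import ConfigParser

    config = ConfigParser(interpolation=None, strict=False)
    config.read("src/python/ensembl/production/xrefs/config/xref_config.ini")

    # The new danio_rerio ZFIN_ID priority-2 source added above
    src = config["source ZFIN_ID::danio_rerio#02"]
    print(src["name"], src["order"], src["priority"], src["prio_descr"], src["parser"])

    # A species section: taxonomy id plus its comma-separated source list
    plants = config["species plants"]
    print(plants["taxonomy_id"], plants["sources"].split(","))
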
+ +"""Base module to handle xref mapping.""" + +import re +import os +import sys +import warnings +import logging +import subprocess + +from sqlalchemy import select, insert, update, func, delete, desc, text +from sqlalchemy.engine import Engine, Connection +from sqlalchemy.orm import Session, sessionmaker, aliased +from sqlalchemy.sql.expression import case +from sqlalchemy.sql import Select +from typing import Dict, Any, List, Optional, Tuple + +from ensembl.core.models import ( + Gene as GeneORM, + Transcript as TranscriptORM, + Translation as TranslationORM, + Meta as MetaCORM, + AltAllele as AltAlleleCORM, + t_alt_allele_attrib as AltAlleleAttribORM, + ObjectXref as ObjectXrefCORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + UnmappedObject as UnmappedObjectORM, + UnmappedReason as UnmappedReasonORM, + Analysis as AnalysisORM, + OntologyXref as OntologyXrefORM, + ExternalSynonym as ExternalSynonymORM, + DependentXref as DependentXrefCORM, + IdentityXref as IdentityXrefCORM, + SeqRegionAttrib as SeqRegionAttribORM, + AttribType as AttribTypeORM, +) + +from ensembl.xrefs.xref_update_db_model import ( + GeneTranscriptTranslation as GeneTranscriptTranslationORM, + GeneStableId as GeneStableIdORM, + TranscriptStableId as TranscriptStableIdORM, + TranslationStableId as TranslationStableIdORM, + Meta as MetaUORM, + ProcessStatus as ProcessStatusORM, + ObjectXref as ObjectXrefUORM, + AltAllele as AltAlleleUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + GeneDirectXref as GeneDirectXrefORM, + TranscriptDirectXref as TranscriptDirectXrefORM, + TranslationDirectXref as TranslationDirectXrefORM, + Mapping as MappingORM, + MappingJobs as MappingJobsORM, + CoordinateXref as CoordinateXrefORM, + Synonym as SynonymORM, + Pairs as PairsORM, + PrimaryXref as PrimaryXrefORM, + DisplayXrefPriority as DisplayXrefPriorityORM, + GeneDescPriority as GeneDescPriorityORM, +) + +from datetime import datetime + + +class BasicMapper: + def __init__(self, args: Dict[str, Any] = None) -> None: + if args is None: + args = {} + + self._xref = args.get("xref") + self._core = args.get("core") + self._dna_file = args.get("dna_file") + self._protein_file = args.get("protein_file") + self._log_file = args.get("log_file") + self._species_dir = args.get("species_dir") + + def xref(self, xref_db_engine: Engine = None) -> Engine: + """Getter/Setter for the xref DB engine. + + Parameters + ---------- + xref_db_engine: sqlalchemy.engine.Engine, optional + The xref DB engine + + Returns + ------- + The xref DB engine. + """ + if xref_db_engine: + self._xref = xref_db_engine + + return self._xref + + def core(self, core_db_engine: Engine = None) -> Engine: + """Getter/Setter for the core DB engine. + + Parameters + ---------- + core_db_engine: sqlalchemy.engine.Engine, optional + The core DB engine + + Returns + ------- + The core DB engine. + """ + if core_db_engine: + self._core = core_db_engine + + return self._core + + def dna_file(self, dna_file: str = None) -> str: + """Getter/Setter for the dna file. + + Parameters + ---------- + dna_file: str, optional + The path to the dna file + + Returns + ------- + The dna file path + """ + if dna_file: + self._dna_file = dna_file + + return self._dna_file + + def protein_file(self, protein_file: str = None) -> str: + """Getter/Setter for the protein file. 
+ + Parameters + ---------- + protein_file: str, optional + The path to the protein file + + Returns + ------- + The protein file path + """ + if protein_file: + self._protein_file = protein_file + + return self._protein_file + + def log_file(self, log_file: str = None) -> str: + """Getter/Setter for the log file. + + Parameters + ---------- + log_file: str, optional + The path to the log file + + Returns + ------- + The log file path + """ + if log_file: + self._log_file = log_file + + return self._log_file + + def species_dir(self, species_dir: str = None) -> str: + """Getter/Setter for the species directory. + + Parameters + ---------- + species_dir: str, optional + The path to the species directory + + Returns + ------- + The species directory + """ + if species_dir: + self._species_dir = species_dir + + return self._species_dir + + def official_name(self) -> None: + return None + + def add_meta_pair(self, meta_key: str, meta_value: str) -> None: + """Adds a row to the meta table. + + Parameters + ---------- + meta_key: str + The value of the 'meta_key' column in the meta table + meta_value: str + The value of the 'meta_value' column in the meta table + """ + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + with self.xref().connect() as dbi: + dbi.execute( + insert(MetaUORM).values( + meta_key=meta_key, meta_value=meta_value, date=now + ) + ) + + def get_meta_value(self, meta_key: str) -> str: + """Gets a value from the meta table based on key. + + Parameters + ---------- + meta_key: str + The value of the 'meta_key' column in the meta table + """ + with self.xref().connect() as dbi: + query = ( + select(MetaUORM.meta_value) + .where(MetaUORM.meta_key == meta_key) + .order_by(MetaUORM.meta_id.desc()) + ) + value = dbi.execute(query).first() + + if value: + value = value[0] + return value + + def update_process_status(self, status: str) -> None: + """Adds a row to the process_status table. + + Parameters + ---------- + status: str + The value of the 'status' column on the process_status table + """ + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + with self.xref().connect() as dbi: + dbi.execute(insert(ProcessStatusORM).values(status=status, date=now)) + + def set_up_logging(self) -> None: + log_file = self.log_file() + + console_handler = logging.StreamHandler() + file_handler = logging.FileHandler(log_file, mode="a") + console_handler.setLevel(logging.WARNING) + file_handler.setLevel(logging.DEBUG) + + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s | %(levelname)s | %(message)s", + datefmt="%d-%b-%Y %H:%M:%S", + handlers=[console_handler, file_handler], + ) + + def log_progress(self, message: str) -> None: + logging.info(message) + + def get_object_xref_id(self, ensembl_id: int, xref_id: int, ensembl_type: str, linkage_type: str, dbi: Connection, master_xref_id: int = None, status: str = None) -> int: + """Retrieves the object_xref row ID from ensembl ID, xref ID, ensembl type, and linkage type. 
+ + Parameters + ---------- + ensembl_id: int + The ensEMBL feature internal ID + xref_id: int + The xref ID related to the object xref + ensembl_type: str + The feature type (gene, transcript, or translation) + linkage_type: str + The type of link between the xref and ensEMBL feature + master_xref_id: int, optional + The xref ID of the xref that this object xref is dependent on + status: str, optional + The object xref status + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + The object xref ID, if found (else None). + """ + object_xref_id = None + + query = select(ObjectXrefUORM.object_xref_id).where( + ObjectXrefUORM.ensembl_id == ensembl_id, + ObjectXrefUORM.xref_id == xref_id, + ObjectXrefUORM.ensembl_object_type == ensembl_type, + ObjectXrefUORM.linkage_type == linkage_type, + ) + if master_xref_id is not None: + query = query.where(ObjectXrefUORM.master_xref_id == master_xref_id) + if status is not None: + query = query.where(ObjectXrefUORM.ox_status == status) + + result = dbi.execute(query).fetchall() + + if result: + object_xref_id = result[0][0] + + return object_xref_id + + def add_object_xref(self, ensembl_id: int, xref_id: int, ensembl_type: str, linkage_type: str, dbi: Connection, master_xref_id: int = None, status: str = None) -> int: + """Adds data into object xref table in a database. + + Parameters + ---------- + ensembl_id: int + The ensEMBL feature internal ID + xref_id: int + The xref ID related to the object xref + ensembl_type: str + The feature type (gene, transcript, or translation) + linkage_type: str + The type of link between the xref and ensEMBL feature + master_xref_id: int, optional + The xref ID of the xref that this object xref is dependent on + status: str, optional + The object xref status + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + The inserted object xref ID. + """ + query = insert(ObjectXrefUORM).values( + ensembl_id=ensembl_id, + xref_id=xref_id, + ensembl_object_type=ensembl_type, + linkage_type=linkage_type, + ) + if master_xref_id is not None: + query = query.values(master_xref_id=master_xref_id) + if status is not None: + query = query.values(ox_status=status) + dbi.execute(query) + + object_xref_id = self.get_object_xref_id( + ensembl_id, xref_id, ensembl_type, linkage_type, dbi, master_xref_id, status + ) + return object_xref_id + + def biomart_fix(self, db_name: str, type1: str, type2: str, dbi: Connection) -> None: + logging.info( + f"{db_name} is associated with both {type1} and {type2} object types. Fixing." 
+ ) + + # Figure out where to move xref to + to_type, from_type, to_id, from_id = None, None, None, None + if type1 == "Gene" or type2 == "Gene": + to_type = "Gene" + + if type1 == "Translation" or type2 == "Translation": + from_type = "Translation" + else: + from_type = "Transcript" + else: + to_type = "Transcript" + from_type = "Translation" + + logging.info(f"Moving all associations from {from_type} to {to_type}") + + to_id = getattr(GeneTranscriptTranslationORM, to_type.lower() + "_id") + from_id = getattr(GeneTranscriptTranslationORM, from_type.lower() + "_id") + + # Move the object xref + query = ( + update(ObjectXrefUORM) + .values(ensembl_object_type=to_type, ensembl_id=to_id) + .where( + ObjectXrefUORM.ensembl_object_type == from_type, + ObjectXrefUORM.ensembl_id == from_id, + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.ox_status == "DUMP_OUT", + SourceUORM.name == db_name, + ) + .prefix_with("IGNORE") + ) + dbi.execute(query) + + # Delete moved object xref + query = ( + select(ObjectXrefUORM.object_xref_id) + .outerjoin( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + .where( + ObjectXrefUORM.ensembl_object_type == from_type, + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.ox_status == "DUMP_OUT", + SourceUORM.name == db_name, + ) + ) + for row in dbi.execute(query).mappings().all(): + dbi.execute( + delete(ObjectXrefUORM).where( + ObjectXrefUORM.object_xref_id == row.object_xref_id + ) + ) + dbi.execute( + delete(IdentityXrefUORM).where( + IdentityXrefUORM.object_xref_id == row.object_xref_id + ) + ) + + # Delete dependent xref + sub_query = select(ObjectXrefUORM.object_xref_id) + query = delete(DependentXrefUORM).where( + DependentXrefUORM.object_xref_id.not_in(sub_query) + ) + dbi.execute(query) + + def update_object_xref_status(self, object_xref_id: int, status: str, dbi: Connection) -> None: + query = ( + update(ObjectXrefUORM) + .where(ObjectXrefUORM.object_xref_id == object_xref_id) + .values(ox_status=status) + ) + dbi.execute(query) diff --git a/src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py b/src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py new file mode 100644 index 000000000..535bb7ad6 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py @@ -0,0 +1,111 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
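
BasicMapper above is the plumbing shared by all the mapper modules: it holds the xref and core engines, the file locations, logging set-up, and the low-level object_xref helpers. A hedged usage sketch (connection URLs, IDs, the status string and the log path are placeholders; whether writes persist depends on how the pipeline's engines handle commits):

    from sqlalchemy import create_engine
    from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper

    # Placeholder URLs; in the pipeline these come from the workflow parameters
    xref_engine = create_engine("mysql+pymysql://user:pass@host:3306/species_xref_update")
    core_engine = create_engine("mysql+pymysql://user:pass@host:3306/species_core_110_38")

    mapper = BasicMapper({
        "xref": xref_engine,
        "core": core_engine,
        "log_file": "/tmp/xref_mapper.log",  # hypothetical path
    })
    mapper.set_up_logging()
    mapper.update_process_status("mapping_started")  # illustrative status value

    # add_object_xref() only inserts, so callers check for an existing row first,
    # exactly as the mapper modules below do (the IDs here are made up)
    with xref_engine.connect() as dbi:
        ox_id = mapper.get_object_xref_id(1234, 5678, "Gene", "DIRECT", dbi)
        if ox_id is None:
            ox_id = mapper.add_object_xref(1234, 5678, "Gene", "DIRECT", dbi)
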
+ +"""Mapper module for processing Checksum xref data.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class ChecksumMapper(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + self.mapper(mapper) + mapper.set_up_logging() + + def target(self) -> None: + return None + + def mapper(self, mapper: BasicMapper = None): + if mapper: + self._mapper = mapper + + return self._mapper + + def upload(self, results: List[Dict[str, Any]], species_id: int) -> None: + if not species_id: + logging.info("No species_id found, doing nothing") + return + + source_id = self.source_id() + + logging.info("Deleting records from previous possible upload runs") + with self.xref().connect() as xref_dbi: + self._delete_entries("object_xref", source_id, xref_dbi) + self._delete_entries("xref", source_id, xref_dbi) + + # Start session, in order to get inserted IDs + Session = sessionmaker(self.xref()) + with Session.begin() as session: + logging.info("Starting xref insertion") + + # Record UPIs to make sure we do not attempt to insert duplicate UPIs + upi_xref_id = {} + for row in results: + upi = row["upi"] + if upi_xref_id.get(upi): + row["xref_id"] = upi_xref_id[upi] + else: + xref_object = XrefUORM( + source_id=source_id, + accession=upi, + label=upi, + version=1, + species_id=species_id, + info_type="CHECKSUM", + ) + session.add(xref_object) + session.flush() + row["xref_id"] = xref_object.xref_id + upi_xref_id[upi] = xref_object.xref_id + + logging.info("Starting object_xref insertion") + for row in results: + object_xref_object = ObjectXrefUORM( + ensembl_id=row["id"], + ensembl_object_type=row["object_type"], + xref_id=row["xref_id"], + linkage_type="CHECKSUM", + ox_status="DUMP_OUT", + ) + session.add(object_xref_object) + + logging.info("Finished insertions") + + def source_id(self) -> int: + source_name = self.external_db_name() + + with self.xref().connect() as dbi: + source_id = dbi.execute( + select(SourceUORM.source_id).where(SourceUORM.name == source_name) + ).scalar() + + return int(source_id) + + def _delete_entries(self, table: str, source_id: int, dbi: Connection) -> None: + if table == "xref": + query = delete(XrefUORM).where(XrefUORM.source_id == source_id) + elif table == "object_xref": + query = delete(ObjectXrefUORM).where( + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + XrefUORM.source_id == source_id, + ) + else: + raise AttributeError( + f"Invalid table to delete: {table}. Can either be 'xref' or 'object_xref'." + ) + + count = dbi.execute(query).rowcount + + logging.info(f"Deleted {count} entries from '{table}' table") diff --git a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py new file mode 100644 index 000000000..d938d966c --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py @@ -0,0 +1,130 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for processing coordinate xref data.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * +from ensembl.common.RangeRegistry import RangeRegistry + +coding_weight = 2 +ens_weight = 3 +transcript_score_threshold = 0.75 + + +class CoordinateMapper(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + self.species_dir(mapper.species_dir()) + mapper.set_up_logging() + + def run_coordinatemapping(self, species_name: str, species_id: int, scripts_dir: str) -> None: + self.update_process_status("coordinate_xrefs_started") + + # We only do coordinate mapping for mouse and human for now + if species_name != "mus_musculus" and species_name != "homo_sapiens": + self.update_process_status("coordinate_xref_finished") + return + + output_dir = self.species_dir() + xref_filename = os.path.join(output_dir, "xref_coord.txt") + object_xref_filename = os.path.join(output_dir, "object_xref_coord.txt") + unmapped_reason_filename = os.path.join(output_dir, "unmapped_reason_coord.txt") + unmapped_object_filename = os.path.join(output_dir, "unmapped_object_coord.txt") + + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + # Figure out the last used IDs in the core DB + xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() + object_xref_id = core_dbi.execute( + select(func.max(ObjectXrefCORM.object_xref_id)) + ).scalar() + unmapped_object_id = core_dbi.execute( + select(func.max(UnmappedObjectORM.unmapped_object_id)) + ).scalar() + unmapped_reason_id = core_dbi.execute( + select(func.max(UnmappedReasonORM.unmapped_reason_id)) + ).scalar() + + logging.info( + f"Last used xref_id={xref_id}, object_xref_id={object_xref_id}, unmapped_object_id={unmapped_object_id}, unmapped_reason_id={unmapped_reason_id}" + ) + + # Get an analysis ID + analysis_params = f"weights(coding,ensembl)={coding_weight:.2f},{ens_weight:.2f};transcript_score_threshold={transcript_score_threshold:.2f}" + analysis_id = core_dbi.execute( + select(AnalysisORM.analysis_id).where( + AnalysisORM.logic_name == "xrefcoordinatemapping", + AnalysisORM.parameters == analysis_params, + ) + ).scalar() + + if not analysis_id: + analysis_id = core_dbi.execute( + select(AnalysisORM.analysis_id).where( + AnalysisORM.logic_name == "xrefcoordinatemapping" + ) + ).scalar() + + if analysis_id: + logging.info("Will update 'analysis' table with new parameter settings") + + # Update an existing analysis + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + core_dbi.execute( + update(AnalysisORM) + .where(AnalysisORM.analysis_id == analysis_id) + .values(created=now, parameters=analysis_params) + ) + else: + logging.info( + f"Cannot find analysis ID for this analysis: logic_name = 'xrefcoordinatemapping' parameters = {analysis_params}" + ) + + # Store a new analysis + logging.info("A new analysis will be added") + + analysis_id = core_dbi.execute( + select(func.max(AnalysisORM.analysis_id)) + ).scalar() + logging.info(f"Last used analysis_id is {analysis_id}") + + analysis_id += 1 + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + core_dbi.execute( + insert(AnalysisORM).values( + analysis_id=analysis_id, + created=now, + logic_name="xrefcoordinatemapping", + program="CoordinateMapper.pm", + parameters=analysis_params, + module="CoordinateMapper.pm", + ) + ) + + if analysis_id: + logging.info(f"Analysis ID is {analysis_id}") + 
+ logging.info(f"Running perl script {scripts_dir}/coordinmate_mapper.pl") + perl_cmd = f"perl {scripts_dir}/coordinmate_mapper.pl --xref_db_url '{self.xref()}' --core_db_url '{self.core()}' --species_id {species_id} --output_dir '{output_dir}' --analysis_id {analysis_id}" + cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE) + + self.update_process_status("coordinate_xref_finished") + + self.biomart_fix("UCSC", "Translation", "Gene", xref_dbi) + self.biomart_fix("UCSC", "Transcript", "Gene", xref_dbi) + + xref_dbi.close() + core_dbi.close() diff --git a/src/python/ensembl/production/xrefs/mappers/CoreInfo.py b/src/python/ensembl/production/xrefs/mappers/CoreInfo.py new file mode 100644 index 000000000..eff41f4a2 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/CoreInfo.py @@ -0,0 +1,320 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for loading core data into an xref database.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class CoreInfo(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def get_core_data(self) -> None: + # Load table gene_transcript_translation + self.load_gene_transcript_translation() + + # Load tables xxx_stable_id + self.load_stable_ids() + + self.update_process_status("core_data_loaded") + + def load_gene_transcript_translation(self) -> None: + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + query = select( + TranscriptORM.gene_id, + TranscriptORM.transcript_id, + TranslationORM.translation_id, + ).outerjoin( + TranslationORM, TranscriptORM.transcript_id == TranslationORM.transcript_id + ) + for row in core_dbi.execute(query).mappings().all(): + xref_dbi.execute( + insert(GeneTranscriptTranslationORM) + .values( + gene_id=row.gene_id, + transcript_id=row.transcript_id, + translation_id=row.translation_id, + ) + .prefix_with("IGNORE") + ) + + xref_dbi.close() + core_dbi.close() + + def load_stable_ids(self) -> None: + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + core_tables = { + "gene": GeneORM, + "transcript": TranscriptORM, + "translation": TranslationORM, + } + xref_tables = { + "gene": GeneStableIdORM, + "transcript": TranscriptStableIdORM, + "translation": TranslationStableIdORM, + } + + for table in ["gene", "transcript", "translation"]: + column = getattr(core_tables[table], f"{table}_id") + core_query = select( + column.label("internal_id"), core_tables[table].stable_id + ) + if table == "transcript": + core_query = core_query.add_columns(TranscriptORM.biotype) + + count = 0 + for row in core_dbi.execute(core_query).mappings().all(): + xref_query = ( + insert(xref_tables[table]) + .values(internal_id=row.internal_id, stable_id=row.stable_id) + .prefix_with("IGNORE") + ) + if table == "transcript": + xref_query = 
xref_query.values(biotype=row.biotype) + xref_dbi.execute(xref_query) + + count += 1 + + logging.info(f"{count} {table}s loaded from core DB") + + xref_dbi.close() + core_dbi.close() + + def get_alt_alleles(self) -> None: + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + alt_allele_list = self.fetch_all_alt_alleles(core_dbi) + + count = len(alt_allele_list) + alt_id_to_gene_id, gene_id_to_alt_id, is_reference = {}, {}, {} + max_alt_id = 0 + + if count > 0: + xref_dbi.execute(delete(AltAlleleUORM)) + + alt_added, num_of_genes = 0, 0 + + # Iterate through all alt-allele groups, pushing unique alleles into the xref alt allele table + # Track the reference gene IDs + for group_id, group_members in alt_allele_list.items(): + ref_gene = self.rep_gene_id(group_members) + + # Representative gene not guaranteed, try to find an alternative best fit + if not ref_gene: + logging.info("Get alternative reference gene") + for gene_id in self.get_all_genes(group_members): + query = select(AttribTypeORM.code).where( + SeqRegionAttribORM.seq_region_id == GeneORM.seq_region_id, + AttribTypeORM.attrib_type_id + == SeqRegionAttribORM.attrib_type_id, + GeneORM.gene_id == gene_id, + AttribTypeORM.code == "non_ref", + ) + result = core_dbi.execute(query) + if result.rowcount > 0: + continue + else: + ref_gene = gene_id + break + + if not ref_gene: + logging.warning( + f"Tried very hard but failed to select a representative gene for alt-allele-group {group_id}" + ) + continue + + is_reference[ref_gene] = 1 + others = [] + for member in group_members: + if member[0] != ref_gene: + others.append(member[0]) + + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=group_id, gene_id=ref_gene, is_reference=1 + ) + ) + num_of_genes += 1 + alt_added += 1 + for gene_id in others: + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=group_id, gene_id=gene_id, is_reference=0 + ) + ) + num_of_genes += 1 + + if group_id > max_alt_id: + max_alt_id = group_id + + logging.info(f"{alt_added} alleles found containing {num_of_genes} genes") + else: + logging.info("No alt alleles found for this species") + + # LRGs added as alt_alleles in the XREF system but never added to core + count = 0 + old_count, new_count, lrg_count = 0, 0, 0 + + query = ( + select(ObjectXrefCORM.ensembl_id, GeneORM.gene_id) + .where( + XrefCORM.xref_id == ObjectXrefCORM.xref_id, + ExternalDbORM.external_db_id == XrefCORM.external_db_id, + ObjectXrefCORM.ensembl_object_type == "Gene", + XrefCORM.display_label == GeneORM.stable_id, + ) + .filter(ExternalDbORM.db_name.like("Ens_Hs_gene")) + ) + for row in core_dbi.execute(query).mappings().all(): + # If the core gene is already in an alt_allele set then use that alt_id for the LRG gene only + # Else use a new one and add both core and LRG + group_id = self.fetch_group_id_by_gene_id(row.gene_id, core_dbi) + if group_id: + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=group_id, gene_id=row.ensembl_id, is_reference=0 + ) + ) + old_count += 1 + else: + group_id = self.fetch_group_id_by_gene_id(row.ensembl_id, core_dbi) + if group_id: + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=group_id, + gene_id=row.ensembl_id, + is_reference=1, + ) + ) + lrg_count += 1 + logging.info(f"LRG peculiarity\t{row.gene_id}\t{row.ensembl_id}") + else: + max_alt_id += 1 + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=max_alt_id, + gene_id=row.ensembl_id, + is_reference=0, + ) + ) + xref_dbi.execute( + 
insert(AltAlleleUORM).values( + alt_allele_id=max_alt_id, + gene_id=row.gene_id, + is_reference=1, + ) + ) + new_count += 1 + count += 1 + + if count: + logging.info( + f"Added {count} alt_alleles for the LRGs. {old_count} added to previous alt_alleles and {new_count} new ones" + ) + logging.info(f"LRG problem count = {lrg_count}") + + xref_dbi.close() + core_dbi.close() + + self.update_process_status("alt_alleles_added") + + def fetch_all_alt_alleles(self, dbi: Connection) -> Dict[int, List[List[Any]]]: + group_list = {} + query = None + + if self.is_multispecies(dbi): ##### TO DO: handle multiespecies + raise NotImplementedError(f"Pipeline cannot handle multispecies DBs yet") + + query = select(AltAlleleCORM.alt_allele_group_id).distinct() + + for row in dbi.execute(query).mappings().all(): + group_members = self.fetch_members_by_group_id(row.alt_allele_group_id, dbi) + group_list[row.alt_allele_group_id] = group_members + + return group_list + + def fetch_members_by_group_id(self, group_id: int, dbi: Connection) -> List[List[Any]]: + members = [] + + query = ( + select(AltAlleleCORM.alt_allele_id, AltAlleleCORM.gene_id) + .where(AltAlleleCORM.alt_allele_group_id == group_id) + .order_by(AltAlleleCORM.alt_allele_id) + ) + for row in dbi.execute(query).mappings().all(): + # Fetch alt_allele attributes + attrib_list = {} + query = select(AltAlleleAttribORM.columns.attrib).where( + AltAlleleAttribORM.columns.alt_allele_id == row.alt_allele_id + ) + for attrib_row in dbi.execute(query).mappings().all(): + attrib_list[attrib_row.attrib] = 1 + + members.append([row.gene_id, attrib_list]) + + return members + + def fetch_group_id_by_gene_id(self, gene_id: int, dbi: Connection) -> Optional[int]: + query = ( + select(AltAlleleCORM.alt_allele_group_id) + .where(AltAlleleCORM.gene_id == gene_id) + .order_by(AltAlleleCORM.alt_allele_group_id) + ) + group_list = dbi.execute(query).mappings().all() + + if len(group_list) > 0: + return group_list[0].alt_allele_group_id + + return None + + def is_multispecies(self, dbi: Connection) -> bool: + result = dbi.execute( + select(MetaCORM.meta_value).where( + MetaCORM.meta_key == "species.taxonomy_id" + ) + ) + + if result.rowcount > 1: + return True + else: + return False + + def rep_gene_id(self, group: List[List[Any]]) -> Optional[int]: + for allele in group: + gene_id = allele[0] + allele_type = allele[1] + + if allele_type.get("IS_REPRESENTATIVE"): + return gene_id + + logging.warning( + "No representative allele currently set for this AltAlleleGroup" + ) + return None + + def get_all_genes(self, group: List[List[Any]]) -> List[int]: + gene_ids = [] + + for allele in group: + gene_ids.append(allele[0]) + + return sorted(gene_ids) diff --git a/src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py b/src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py new file mode 100644 index 000000000..c3113dee3 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py @@ -0,0 +1,182 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
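
For readers following the alt-allele handling in CoreInfo above: fetch_members_by_group_id() returns each group member as a [gene_id, attrib_dict] pair, and rep_gene_id()/get_all_genes() simply walk that structure. A tiny self-contained illustration with made-up gene ids (IS_REPRESENTATIVE is the attrib code the real query relies on; the other attrib shown is just an example):

    # Shape produced by fetch_members_by_group_id(): [gene_id, {attrib_code: 1, ...}]
    group_members = [
        [1001, {"IS_REPRESENTATIVE": 1}],
        [1002, {}],
        [1003, {"AUTOMATICALLY_ASSIGNED": 1}],
    ]

    def rep_gene_id(group):
        # Same walk as CoreInfo.rep_gene_id(): first member flagged IS_REPRESENTATIVE wins
        for gene_id, attribs in group:
            if attribs.get("IS_REPRESENTATIVE"):
                return gene_id
        return None

    print(rep_gene_id(group_members))            # 1001
    print(sorted(g for g, _ in group_members))   # [1001, 1002, 1003], as get_all_genes() returns
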
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for processing direct xref data.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class DirectXrefsMapper(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def process(self) -> None: + xref_dbi = self.xref().connect() + + db_tables = { + "gene": {"direct": GeneDirectXrefORM, "stable_id": GeneStableIdORM}, + "transcript": { + "direct": TranscriptDirectXrefORM, + "stable_id": TranscriptStableIdORM, + }, + "translation": { + "direct": TranslationDirectXrefORM, + "stable_id": TranslationStableIdORM, + }, + } + + err_count = {} + object_xref_id = 0 + + for table in ["gene", "transcript", "translation"]: + direct_table = db_tables[table]["direct"] + stable_id_table = db_tables[table]["stable_id"] + + count, duplicate_direct_count, duplicate_dependent_count = 0, 0, 0 + + # Get the direct xrefs + stable_id_query = ( + select( + SourceUORM.name, + direct_table.general_xref_id, + stable_id_table.internal_id, + direct_table.ensembl_stable_id, + ) + .outerjoin( + stable_id_table, + stable_id_table.stable_id == direct_table.ensembl_stable_id, + ) + .where( + XrefUORM.xref_id == direct_table.general_xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ) + ) + for row in xref_dbi.execute(stable_id_query).mappings().all(): + dbname = row.name + xref_id = row.general_xref_id + internal_id = row.internal_id + stable_id = row.ensembl_stable_id + + # Check if internal id exists. 
If not, it is an internal id already or stable_id no longer exists + if internal_id is None: + if re.search(r"^\d+$", stable_id): + internal_id = stable_id + else: + err_count[dbname] = err_count.get(dbname, 0) + 1 + continue + + object_xref_id += 1 + count += 1 + master_xref_ids = [] + + if internal_id == 0: + raise LookupError( + f"Problem: could not find stable id {stable_id} and got past the first check for {dbname}" + ) + + # Insert into object xref table + object_xref_id = self.get_object_xref_id( + internal_id, xref_id, table, "DIRECT", xref_dbi + ) + if object_xref_id: + duplicate_direct_count += 1 + continue + else: + object_xref_id = self.add_object_xref( + internal_id, xref_id, table, "DIRECT", xref_dbi + ) + + # Insert into identity xref table + xref_dbi.execute( + insert(IdentityXrefUORM).values( + object_xref_id=object_xref_id, + query_identity=100, + target_identity=100, + ) + ) + master_xref_ids.append(xref_id) + + duplicate_dependent_count += self.process_dependents( + { + "master_xrefs": master_xref_ids, + "dup_count": duplicate_dependent_count, + "table": table, + "internal_id": internal_id, + }, + xref_dbi, + ) + + if duplicate_direct_count or duplicate_dependent_count: + logging.info( + f"Duplicate entries ignored for {duplicate_direct_count} direct xrefs and {duplicate_dependent_count} dependent xrefs" + ) + + for key, val in err_count.items(): + logging.warning( + f"{val} direct xrefs for database {key} could not be added as their stable_ids could not be found" + ) + + xref_dbi.close() + + self.update_process_status("direct_xrefs_parsed") + + def process_dependents(self, args: Dict[str, Any], dbi: Connection) -> int: + master_xref_ids = args["master_xrefs"] + duplicate_dep_count = args["dup_count"] + table = args["table"] + internal_id = args["internal_id"] + + for master_xref_id in master_xref_ids: + # Get all dependents related to master xref + dep_query = select(DependentXrefUORM.dependent_xref_id).where( + DependentXrefUORM.master_xref_id == master_xref_id + ) + for dep in dbi.execute(dep_query).mappings().all(): + # Add dependent object xref + dep_object_xref_id = self.get_object_xref_id( + internal_id, + dep.dependent_xref_id, + table, + "DEPENDENT", + dbi, + master_xref_id, + ) + if dep_object_xref_id: + duplicate_dep_count += 1 + continue + else: + dep_object_xref_id = self.add_object_xref( + internal_id, + dep.dependent_xref_id, + table, + "DEPENDENT", + dbi, + master_xref_id, + ) + + # Add identity xref + dbi.execute( + insert(IdentityXrefUORM).values( + object_xref_id=dep_object_xref_id, + query_identity=100, + target_identity=100, + ) + ) + + # Get the dependent dependents just in case + master_xref_ids.append(dep.dependent_xref_id) + + return duplicate_dep_count diff --git a/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py new file mode 100644 index 000000000..a2a543589 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py @@ -0,0 +1,871 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
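
A small note on process_dependents() above: it appends every dependent xref back onto master_xref_ids while iterating over that same list, so dependents-of-dependents are handled in the same pass (the real method also skips pairs whose object_xref already exists). A standalone sketch of just that traversal pattern over a made-up dependency map:

    # Hypothetical master -> dependent xref id relationships
    dependents = {10: [20, 21], 20: [30], 21: [], 30: []}

    master_xref_ids = [10]
    processed = []
    for master_xref_id in master_xref_ids:          # the list grows as we iterate
        for dep in dependents.get(master_xref_id, []):
            processed.append(dep)
            master_xref_ids.append(dep)             # pick up dependents of dependents too

    print(processed)   # [20, 21, 30]
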
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for setting display xrefs in the core DB.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class DisplayXrefs(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + self.mapper(mapper) + mapper.set_up_logging() + + def mapper(self, mapper: BasicMapper = None) -> BasicMapper: + if mapper: + self._mapper = mapper + + return self._mapper + + def build_display_xrefs(self) -> None: + logging.info("Processing display xrefs") + + mapper = self.mapper() + + # Set the display xrefs + if hasattr(mapper, "set_display_xrefs"): + mapper.set_display_xrefs() + else: + set_transcript_display_xrefs = False + if hasattr(mapper, "set_transcript_names"): + set_transcript_display_xrefs = True + self.set_display_xrefs(set_transcript_display_xrefs) + + # Set transcript names + if hasattr(mapper, "set_transcript_names"): + mapper.set_transcript_names() + else: + self.set_transcript_names() + + self.update_process_status("display_xrefs_done") + + # Set the gene descriptions + self.set_gene_descriptions() + + # Set the meta timestamp + self.set_meta_timestamp() + + self.update_process_status("gene_descriptions_done") + + def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: + logging.info("Setting Transcript and Gene display xrefs") + + # Get the xref offset used when adding the xrefs into the core DB + xref_offset = self.get_meta_value("xref_offset") + xref_offset = int(xref_offset) + logging.info(f"Using xref offset of {xref_offset}") + + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + mapper = self.mapper() + + # Reset transcript display xrefs + if set_transcript_display_xrefs: + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=None) + .where(TranslationORM.biotype != "LRG_gene") + ) + + for object_type in ["Gene", "Transcript"]: + if object_type == "Transcript" and not set_transcript_display_xrefs: + continue + precedence_list, ignore = None, None + + # Get name source priorities and ignore queries + method = f"{object_type.lower()}_display_xref_sources" + if hasattr(mapper, method): + precedence_list, ignore = getattr(mapper, method)() + else: + precedence_list, ignore = getattr(self, method)() + + # Add the priorities into the DB + priority = 0 + logging.info(f"Precedence for {object_type} display xrefs (1- best name)") + + for source_name in precedence_list: + priority += 1 + + # Get the source ID + query = ( + select(SourceUORM.source_id, SourceUORM.name) + .where(SourceUORM.name.like(source_name)) + .order_by(SourceUORM.priority) + ) + for row in xref_dbi.execute(query).mappings().all(): + xref_dbi.execute( + insert(DisplayXrefPriorityORM).values( + ensembl_object_type=object_type, + source_id=row.source_id, + priority=priority, + ) + ) + + logging.info(f"{priority} - {row.name}") + + # Execute ignore queries + self._apply_ignore(ignore, xref_dbi) + + object_seen = {} + display_xref_count = 0 + + # Build the case statements + GTTGene = aliased(GeneTranscriptTranslationORM) + GTTTranscript = aliased(GeneTranscriptTranslationORM) + 
GTTTranslation = aliased(GeneTranscriptTranslationORM) + gene_case_stmt = case( + [ + (ObjectXrefUORM.ensembl_object_type == "Gene", GTTGene.gene_id), + ( + ObjectXrefUORM.ensembl_object_type == "Transcript", + GTTTranscript.gene_id, + ), + ( + ObjectXrefUORM.ensembl_object_type == "Translation", + GTTTranslation.gene_id, + ), + ], + ).label("d_gene_id") + transcript_case_stmt = case( + [ + ( + ObjectXrefUORM.ensembl_object_type == "Gene", + GTTGene.transcript_id, + ), + ( + ObjectXrefUORM.ensembl_object_type == "Transcript", + GTTTranscript.transcript_id, + ), + ( + ObjectXrefUORM.ensembl_object_type == "Translation", + GTTTranslation.transcript_id, + ), + ], + ).label("d_transcript_id") + + # Get all relevent xrefs for this object type based on precendence sources + query = ( + select( + gene_case_stmt, + transcript_case_stmt, + DisplayXrefPriorityORM.priority, + XrefUORM.xref_id, + ) + .join( + SourceUORM, SourceUORM.source_id == DisplayXrefPriorityORM.source_id + ) + .join(XrefUORM, XrefUORM.source_id == SourceUORM.source_id) + .join(ObjectXrefUORM, ObjectXrefUORM.xref_id == XrefUORM.xref_id) + .join( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + .outerjoin(GTTGene, GTTGene.gene_id == ObjectXrefUORM.ensembl_id) + .outerjoin( + GTTTranscript, + GTTTranscript.transcript_id == ObjectXrefUORM.ensembl_id, + ) + .outerjoin( + GTTTranslation, + GTTTranslation.translation_id == ObjectXrefUORM.ensembl_id, + ) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + DisplayXrefPriorityORM.ensembl_object_type == object_type, + ) + .order_by( + "d_gene_id", + ObjectXrefUORM.ensembl_object_type, + DisplayXrefPriorityORM.priority, + desc( + IdentityXrefUORM.target_identity + + IdentityXrefUORM.query_identity + ), + ObjectXrefUORM.unused_priority.desc(), + XrefUORM.accession, + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + object_id = None + if object_type == "Gene": + object_id = row.d_gene_id + elif object_type == "Transcript": + object_id = row.d_transcript_id + + # Update the display xrefs + if not object_seen.get(object_id): + xref_id = int(row.xref_id) + if object_type == "Gene": + core_dbi.execute( + update(GeneORM) + .values(display_xref_id=xref_id + xref_offset) + .where( + GeneORM.gene_id == object_id, + GeneORM.display_xref_id == None, + ) + ) + elif object_type == "Transcript": + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=xref_id + xref_offset) + .where(TranscriptORM.transcript_id == object_id) + ) + + display_xref_count += 1 + object_seen[object_id] = 1 + + logging.info(f"Updated {display_xref_count} {object_type} display_xrefs") + + # Reset ignored object xrefs + xref_dbi.execute( + update(ObjectXrefUORM) + .values(ox_status="DUMP_OUT") + .where(ObjectXrefUORM.ox_status == "NO_DISPLAY") + ) + + # Remove synonyms not linked to display xrefs + query = ( + select(XrefCORM.xref_id) + .outerjoin(GeneORM, GeneORM.display_xref_id == XrefCORM.xref_id) + .where(GeneORM.display_xref_id == None) + ) + result = core_dbi.execute(query).fetchall() + xref_ids = [row[0] for row in result] + + core_dbi.execute( + delete(ExternalSynonymORM).where(ExternalSynonymORM.xref_id.in_(xref_ids)) + ) + + xref_dbi.close() + core_dbi.close() + + def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = [ + "VGNC", + "HGNC", + "MGI", + "RGD", + "ZFIN_ID", + "Xenbase", + "RFAM", + "miRBase", + "EntrezGene", + "Uniprot_gn", + ] + ignore_queries = {} + + # Ignore EntrezGene labels dependent on 
predicted RefSeqs + MasterXref = aliased(XrefUORM) + DependentXref = aliased(XrefUORM) + MasterSource = aliased(SourceUORM) + DependentSource = aliased(SourceUORM) + + query = select(ObjectXrefUORM.object_xref_id.distinct()).where( + ObjectXrefUORM.xref_id == DependentXrefUORM.dependent_xref_id, + ObjectXrefUORM.master_xref_id == DependentXrefUORM.master_xref_id, + DependentXrefUORM.dependent_xref_id == DependentXref.xref_id, + DependentXrefUORM.master_xref_id == MasterXref.xref_id, + MasterXref.source_id == MasterSource.source_id, + DependentXref.source_id == DependentSource.source_id, + MasterSource.name.like("Refseq%predicted"), + DependentSource.name.like("EntrezGene"), + ObjectXrefUORM.ox_status == "DUMP_OUT", + ) + ignore_queries["EntrezGene"] = query + + query = ( + select(ObjectXrefUORM.object_xref_id) + .join(XrefUORM, XrefUORM.xref_id == ObjectXrefUORM.xref_id) + .join(SourceUORM, SourceUORM.source_id == XrefUORM.source_id) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + XrefUORM.label.regexp_match("^LOC[[:digit:]]+"), + ) + ) + ignore_queries["LOC_prefix"] = query + + return sources_list, ignore_queries + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + return self.gene_display_xref_sources() + + def _apply_ignore(self, ignore_queries: Dict[str, Select], dbi: Connection) -> None: + # Set status to NO_DISPLAY for object_xrefs with a display_label that is just numeric + query = ( + update(ObjectXrefUORM) + .values(ox_status="NO_DISPLAY") + .where( + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.ox_status.like("DUMP_OUT"), + XrefUORM.label.regexp_match("^[0-9]+$"), + ) + ) + dbi.execute(query) + + # Go through ignore queries + for ignore_type, ignore_query in ignore_queries.items(): + # Set status to NO_DISPLAY for ignore results + for row in dbi.execute(ignore_query).mappings().all(): + dbi.execute( + update(ObjectXrefUORM) + .values(ox_status="NO_DISPLAY") + .where(ObjectXrefUORM.object_xref_id == row.object_xref_id) + ) + + def set_transcript_names(self) -> None: + logging.info("Assigning transcript names from gene names") + + core_dbi = self.core().connect() + + # Reset transcript display xrefs + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=None) + .where(TranscriptORM.biotype != "LRG_gene") + ) + + # Get the max xref and object_xref IDs + xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() + xref_id = int(xref_id) + object_xref_id = core_dbi.execute( + select(func.max(ObjectXrefCORM.object_xref_id)) + ).scalar() + object_xref_id = int(object_xref_id) + + # Get all genes with set display_xref_id + query = select( + GeneORM.gene_id, + ExternalDbORM.db_name, + XrefCORM.dbprimary_acc, + XrefCORM.display_label, + XrefCORM.description, + ).where( + GeneORM.display_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == ExternalDbORM.external_db_id, + ) + for row in core_dbi.execute(query).mappings().all(): + ext = 201 + + # Get the ID of transcript name external DB + external_db_id = core_dbi.execute( + select(ExternalDbORM.external_db_id).where( + ExternalDbORM.db_name.like(f"{row.db_name}_trans_name") + ) + ).scalar() + + if not external_db_id: + raise LookupError( + f"No external_db_id found for '{row.db_name}_trans_name'" + ) + + # Get transcripts related to current gene + query = ( + select(TranscriptORM.transcript_id) + .where(TranscriptORM.gene_id == row.gene_id) + .order_by(TranscriptORM.seq_region_start, 
TranscriptORM.seq_region_end) + ) + for transcript_row in core_dbi.execute(query).mappings().all(): + object_xref_id += 1 + + display_label = f"{row.display_label}-{ext}" + + # Check if xref already exists + insert_xref_id = core_dbi.execute( + select(XrefCORM.xref_id).where( + XrefCORM.external_db_id == external_db_id, + XrefCORM.display_label == display_label, + XrefCORM.info_type == "MISC", + ) + ).scalar() + + if not insert_xref_id: + xref_id += 1 + info_text = f"via gene {row.dbprimary_acc}" + + # Insert new xref + core_dbi.execute( + insert(XrefCORM) + .values( + xref_id=xref_id, + external_db_id=external_db_id, + dbprimary_acc=display_label, + display_label=display_label, + version=0, + description=row.description, + info_type="MISC", + info_text=info_text, + ) + .prefix_with("IGNORE") + ) + + insert_xref_id = xref_id + + # Insert object xref + core_dbi.execute( + insert(ObjectXrefCORM).values( + object_xref_id=object_xref_id, + ensembl_id=transcript_row.transcript_id, + ensembl_object_type="Transcript", + xref_id=insert_xref_id, + ) + ) + + # Set transcript dispay xref + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=insert_xref_id) + .where(TranscriptORM.transcript_id == transcript_row.transcript_id) + ) + + ext += 1 + + # Delete object xrefs with no matching xref + query = ( + select(ObjectXrefCORM.object_xref_id) + .outerjoin(XrefCORM, XrefCORM.xref_id == ObjectXrefCORM.xref_id) + .where(XrefCORM.xref_id == None) + ) + result = core_dbi.execute(query).fetchall() + object_xref_ids = [row[0] for row in result] + + core_dbi.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.object_xref_id.in_(object_xref_ids) + ) + ) + + core_dbi.close() + + def set_gene_descriptions(self) -> None: + logging.info("Setting gene descriptions") + + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + mapper = self.mapper() + + # Reset the gene descriptions + core_dbi.execute(update(GeneORM).values(description=None)) + + # Get external display names + name_to_external_name = {} + query = select( + ExternalDbORM.external_db_id, + ExternalDbORM.db_name, + ExternalDbORM.db_display_name, + ) + for row in core_dbi.execute(query).mappings().all(): + name_to_external_name[row.db_name] = row.db_display_name + + # Get source ID to external names mappings + if hasattr(mapper, "set_source_id_to_external_name"): + source_id_to_external_name, name_to_source_id = ( + mapper.set_source_id_to_external_name(name_to_external_name, xref_dbi) + ) + else: + source_id_to_external_name, name_to_source_id = ( + self.set_source_id_to_external_name(name_to_external_name, xref_dbi) + ) + + # Get description source priorities and ignore queries + if hasattr(mapper, "gene_description_sources"): + precedence_list = mapper.gene_description_sources() + ignore = None + else: + precedence_list, ignore = self.gene_description_sources() + + # Get description regular expressions + if hasattr(mapper, "gene_description_filter_regexps"): + reg_exps = mapper.gene_description_filter_regexps() + else: + reg_exps = self.gene_description_filter_regexps() + + # Add the description priorities into the DB + priority = 0 + logging.info("Precedence for Gene descriptions (1- best description)") + + for source_name in precedence_list: + priority += 1 + + # Get the source ID + query = select(SourceUORM.source_id, SourceUORM.name).where( + SourceUORM.name.like(source_name) + ) + for row in xref_dbi.execute(query).mappings().all(): + xref_dbi.execute( + insert(GeneDescPriorityORM) + 
.values(source_id=row.source_id, priority=priority) + .prefix_with("IGNORE") + ) + + logging.info(f"{priority} - {row.name}") + + # Execute ignore queries + self._apply_ignore(ignore, xref_dbi) + + no_source_name_in_desc = {} + if hasattr(mapper, "no_source_label_list"): + for source_name in mapper.no_source_label_list(): + source_id = name_to_source_id.get(source_name) + if source_id: + logging.info( + f"Source '{name}' will not have [Source:...] info in description" + ) + no_source_name_in_desc[source_id] = 1 + + gene_desc_updated = {} + + # Build the case statement + GTTGene = aliased(GeneTranscriptTranslationORM) + GTTTranscript = aliased(GeneTranscriptTranslationORM) + GTTTranslation = aliased(GeneTranscriptTranslationORM) + gene_case_stmt = case( + [ + (ObjectXrefUORM.ensembl_object_type == "Gene", GTTGene.gene_id), + ( + ObjectXrefUORM.ensembl_object_type == "Transcript", + GTTTranscript.gene_id, + ), + ( + ObjectXrefUORM.ensembl_object_type == "Translation", + GTTTranslation.gene_id, + ), + ], + ).label("d_gene_id") + + # Get all relevent xrefs for this object type based on precendence sources + query = ( + select( + gene_case_stmt, + XrefUORM.description, + SourceUORM.source_id, + XrefUORM.accession, + GeneDescPriorityORM.priority, + ) + .join(SourceUORM, SourceUORM.source_id == GeneDescPriorityORM.source_id) + .join(XrefUORM, XrefUORM.source_id == SourceUORM.source_id) + .join(ObjectXrefUORM, ObjectXrefUORM.xref_id == XrefUORM.xref_id) + .join( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + .outerjoin(GTTGene, GTTGene.gene_id == ObjectXrefUORM.ensembl_id) + .outerjoin( + GTTTranscript, GTTTranscript.transcript_id == ObjectXrefUORM.ensembl_id + ) + .outerjoin( + GTTTranslation, + GTTTranslation.translation_id == ObjectXrefUORM.ensembl_id, + ) + .where(ObjectXrefUORM.ox_status == "DUMP_OUT") + .order_by( + "d_gene_id", + ObjectXrefUORM.ensembl_object_type, + GeneDescPriorityORM.priority, + desc( + IdentityXrefUORM.target_identity + IdentityXrefUORM.query_identity + ), + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + if gene_desc_updated.get(row.d_gene_id): + continue + + if row.description: + # Apply regular expressions to description + filtered_description = self.filter_by_regexp(row.description, reg_exps) + if filtered_description != "": + source_name = source_id_to_external_name.get(row.source_id) + filtered_description += ( + f" [Source:{source_name};Acc:{row.accession}]" + ) + + # Update the gene description + core_dbi.execute( + update(GeneORM) + .values(description=filtered_description) + .where( + GeneORM.gene_id == row.d_gene_id, GeneORM.description == None + ) + ) + + gene_desc_updated[row.d_gene_id] = 1 + + logging.info(f"{len(gene_desc_updated.keys())} gene descriptions added") + + # Reset ignored object xrefs + xref_dbi.execute( + update(ObjectXrefUORM) + .values(ox_status="DUMP_OUT") + .where(ObjectXrefUORM.ox_status == "NO_DISPLAY") + ) + + xref_dbi.close() + core_dbi.close() + + def get_external_name_mappings(self, core_dbi: Connection, xref_dbi: Connection) -> Tuple[Dict[int, str], Dict[str, int]]: + # Get external display names + external_name_to_display_name = {} + query = select( + ExternalDbORM.external_db_id, + ExternalDbORM.db_name, + ExternalDbORM.db_display_name, + ) + for row in core_dbi.execute(query).mappings().all(): + external_name_to_display_name[row.db_name] = row.db_display_name + + # Get sources for available xrefs + source_id_to_external_name, source_name_to_source_id = {}, {} + query 
= ( + select(SourceUORM.source_id, SourceUORM.name) + .where(SourceUORM.source_id == XrefUORM.source_id) + .group_by(SourceUORM.source_id) + ) + for row in xref_dbi.execute(query).mappings().all(): + if external_name_to_display_name.get(row.name): + source_id_to_external_name[row.source_id] = external_name_to_display_name[row.name] + source_name_to_source_id[row.name] = row.source_id + elif re.search(r"notransfer$", row.name): + logging.info(f"Ignoring notransfer source '{row.name}'") + else: + raise LookupError(f"Could not find {row.name} in external_db table") + + return source_id_to_external_name, source_name_to_source_id + + def set_source_id_to_external_name(self, name_to_external_name: Dict[str, str], dbi: Connection) -> Tuple[Dict[int, str], Dict[str, int]]: + source_id_to_external_name, name_to_source_id = {}, {} + + # Get sources for available xrefs + query = ( + select(SourceUORM.source_id, SourceUORM.name) + .where(SourceUORM.source_id == XrefUORM.source_id) + .group_by(SourceUORM.source_id) + ) + for row in dbi.execute(query).mappings().all(): + if name_to_external_name.get(row.name): + source_id_to_external_name[row.source_id] = name_to_external_name[row.name] + name_to_source_id[row.name] = row.source_id + elif re.search(r"notransfer$", row.name): + logging.info(f"Ignoring notransfer source '{row.name}'") + else: + raise LookupError(f"Could not find {row.name} in external_db table") + + return source_id_to_external_name, name_to_source_id + + def gene_description_sources(self) -> Tuple[List[str], Dict[str, Select]]: + return self.gene_display_xref_sources() + + def gene_description_filter_regexps(self) -> List[str]: + regex = [ + r"[0-9A-Z]{10}RIK PROTEIN[ \.]", + r"\(?[0-9A-Z]{10}RIK PROTEIN\)?[ \.]", + r"^BA\S+\s+\(NOVEL PROTEIN\)\.?", + r"^BC\d+\_\d+\.?", + r"CDNA SEQUENCE\s?,? 
[A-Z]+\d+[ \.;]", + r"^CGI\-\d+ PROTEIN\.?\;?", + r"^CHROMOSOME\s+\d+\s+OPEN\s+READING\s+FRAME\s+\d+\.?.*", + r"CLONE MGC:\d+[ \.;]", + r"^\(CLONE REM\d+\) ORF \(FRAGMENT\)\.*", + r"\(CLONE \S+\)\s+", + r"^DJ\S+\s+\(NOVEL PROTEIN\)\.?", + r"^DKFZP[A-Z0-9]+\s+PROTEIN[\.;]?.*", + r"DNA SEGMENT, CHR.*", + r"EST [A-Z]+\d+[ \.;]", + r"EXPRESSED SEQUENCE [A-Z]+\d+[ \.;]", + r"^FKSG\d+\.?.*", + r"^FLJ\d+\s+PROTEIN.*", + r"^HSPC\d+.*", + r"^HSPC\d+\s+PROTEIN\.?.*", + r"HYPOTHETICAL PROTEIN,", + r"HYPOTHETICAL PROTEIN \S+[\.;]", + r"^\(*HYPOTHETICAL\s+.*", + r"\(*HYPOTHETICAL\s+.*", + r"^KIAA\d+\s+GENE\s+PRODUCT\.?.*", + r"^KIAA\d+\s+PROTEIN\.?.*", + r"^LOC\d+\s*(PROTEIN)?\.?", + r" MGC:\s*\d+[ \.;]", + r"MGC:\s*\d+[ \.;]", + r"^ORF.*", + r"^ORF\s*\d+\s+PROTEIN\.*", + r"^PRED\d+\s+PROTEIN.*", + r"^PRO\d+\.?.*", + r"^PRO\d+\s+PROTEIN\.?.*", + r"^PROTEIN C\d+ORF\d+\.*", + r"PROTEIN KIAA\d+[ \.].*", + r"PROTEIN \S+ HOMOLOG\.?", + r"^Putative uncharacterized protein.*", + r"R\d{5}_\d[ \.,].*", + r"RIKEN CDNA [0-9A-Z]{10}[ \.;]", + r"RIKEN CDNA [0-9A-Z]{10}[ \.]", + r".*RIKEN FULL-LENGTH ENRICHED LIBRARY.*", + r".*RIKEN FULL-LENGTH ENRICHED LIBRARY.*PRODUCT:", + r"^\s*\(\d*\)\s*[ \.]$", + r"^\s*\(\d*\)\s*[ \.]$", + r"^\s*\(?FRAGMENT\)?\.?\s*$", + r"^\s*\(FRAGMENT\)\.?\s*$", + r"\s*\(?GENE\)?\.?;?", + r"^\s*\(?GENE\)?\.?;?\s*$", + r"^\s*\(?GENE\)?\.?\s*$", + r"SIMILAR TO GENBANK ACCESSION NUMBER\s+\S+", + r"^SIMILAR TO GENE.*", + r"^SIMILAR TO HYPOTHETICAL.*", + r"^SIMILAR TO (KIAA|LOC).*", + r"SIMILAR TO (KIAA|LOC|RIKEN).*", + r"^SIMILAR TO PUTATIVE[ \.]", + r"SIMILAR TO PUTATIVE[ \.]", + r"^SIMILAR TO\s+$", + r"SIMILAR TO\s+$", + r"\s*\(?PRECURSOR\)?\.?;?", + r"^\s*\(?PROTEIN\)?\.?\s*$", + r"^\s+\(?\s*$", + r"^\s*\(\s*\)\s*$", + r"^UNKNOWN\s+.*", + r"^WUGSC:H_.*", + r"^WUGSC:.*\s+PROTEIN\.?.*", + ] + + return regex + + def filter_by_regexp(self, string: str, regular_expressions: List[str]) -> str: + for regex in regular_expressions: + string = re.sub(regex, "", string, flags=re.IGNORECASE) + + return string + + def set_meta_timestamp(self) -> None: + with self.core().connect() as dbi: + dbi.execute(delete(MetaCORM).where(MetaCORM.meta_key == "xref.timestamp")) + + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + dbi.execute( + insert(MetaCORM).values(meta_key="xref.timestamp", meta_value=now) + ) + + def set_display_xrefs_from_stable_table(self) -> None: + logging.info("Setting Transcript and Gene display xrefs using stable IDs") + + # Get the xref offset used when adding the xrefs into the core DB + xref_offset = self.get_meta_value("xref_offset") + xref_offset = int(xref_offset) + logging.info(f"Using xref offset of {xref_offset}") + + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + # Reset gene and transcript display xrefs + core_dbi.execute(update(GeneORM).values(display_xref_id=None)) + core_dbi.execute(update(TranscriptORM).values(display_xref_id=None)) + + # Remove descriptions with 'Source' field + core_dbi.execute( + update(GeneORM) + .values(description=None) + .where(GeneORM.description.like("%[Source:%]%")) + ) + + # Get external names and IDs + name_to_external_name, source_id_to_external_name = {}, {} + query = select( + ExternalDbORM.external_db_id, + ExternalDbORM.db_name, + ExternalDbORM.db_display_name, + ) + for row in core_dbi.execute(query).mappings().all(): + name_to_external_name[row.db_name] = row.db_display_name + + query = ( + select(SourceUORM.source_id, SourceUORM.name) + .where(SourceUORM.source_id == XrefUORM.source_id) + 
.group_by(SourceUORM.source_id) + ) + for row in xref_dbi.execute(query).mappings().all(): + if name_to_external_name.get(row.name): + source_id_to_external_name[row.source_id] = name_to_external_name[ + row.name + ] + + gene_count = 0 + + # Set gene names and descriptions + query = select( + GeneStableIdORM.internal_id, + GeneStableIdORM.display_xref_id, + XrefUORM.description, + XrefUORM.source_id, + XrefUORM.accession, + ).where(GeneStableIdORM.display_xref_id == XrefUORM.xref_id) + for row in xref_dbi.execute(query).mappings().all(): + xref_id = int(row.display_xref_id) + + # Set display xref ID + core_dbi.execute( + update(GeneORM) + .values(display_xref_id=(xref_id + xref_offset)) + .where(GeneORM.gene_id == row.internal_id) + ) + + # Set description + if row.description is not None and row.description != "": + description = f"{row.description} [Source:{source_id_to_external_name[row.source_id]};Acc:{row.accession}]" + core_dbi.execute( + update(GeneORM) + .values(description=description) + .where(GeneORM.gene_id == row.internal_id) + ) + + xref_dbi.execute( + update(GeneStableIdORM) + .values(desc_set=1) + .where(GeneStableIdORM.internal_id == row.internal_id) + ) + gene_count += 1 + + logging.info(f"{gene_count} gene descriptions added") + + # Set transcript names and descriptions + query = select( + TranscriptStableIdORM.internal_id, TranscriptStableIdORM.display_xref_id + ) + for row in xref_dbi.execute(query).mappings().all(): + xref_id = int(row.display_xref_id) + + if xref_id: + # Set display xref ID + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=(xref_id + xref_offset)) + .where(TranscriptORM.transcript_id == row.internal_id) + ) + + # Clean up synonyms linked to xrefs which are not display xrefs + query = ( + select(ExternalSynonymORM) + .outerjoin(GeneORM, GeneORM.display_xref_id == XrefCORM.xref_id) + .where( + ExternalSynonymORM.xref_id == XrefCORM.xref_id, + GeneORM.display_xref_id == None, + ) + ) + for row in core_dbi.execute(query).mappings().all(): + core_dbi.execute( + delete(ExternalSynonymORM).where( + ExternalSynonymORM.xref_id == row.xref_id, + ExternalSynonymORM.synonym == row.synonym, + ) + ) + + xref_dbi.close() + core_dbi.close() diff --git a/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py b/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py new file mode 100644 index 000000000..e4c33bf75 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py @@ -0,0 +1,637 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
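To make the DisplayXrefs description handling above concrete: a raw xref description is first passed through filter_by_regexp, and only a non-empty result is written back to the core gene with a source/accession suffix. A minimal, self-contained sketch under that reading (the regular expression, source display name and accession are illustrative values, not taken from this patch):

import re
from typing import List

def filter_by_regexp(string: str, regular_expressions: List[str]) -> str:
    # Strip boilerplate phrases from the raw description (case-insensitive)
    for regex in regular_expressions:
        string = re.sub(regex, "", string, flags=re.IGNORECASE)
    return string

reg_exps = [r"\s*\(FRAGMENT\)\.?\s*$"]  # one of the filter patterns listed above
raw_description = "Breast cancer type 2 susceptibility protein (Fragment)."
filtered = filter_by_regexp(raw_description, reg_exps)
if filtered != "":
    # Same suffix layout used when the gene description is written to the core DB
    filtered += " [Source:HGNC Symbol;Acc:HGNC:1101]"
print(filtered)
# Breast cancer type 2 susceptibility protein [Source:HGNC Symbol;Acc:HGNC:1101]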
+ +"""Mapper module for setting the feature names.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class OfficialNaming(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + self._official_name = mapper.official_name() + mapper.set_up_logging() + + def official_name(self, official_name: str = None) -> str: + if official_name: + self._official_name = official_name + + return self._official_name + + def run(self, species_id: int, verbose: bool) -> None: + logging.info("Starting official naming") + + # If no offical name then we do not want to go any further + dbname = self.official_name() + if not dbname: + self.update_process_status("official_naming_done") + return + + xref_dbi = self.xref().connect() + + # If there are any official names on transcripts or translations, move them onto gene level + if dbname == "MGI": + self.biomart_fix("MGI", "Translation", "Gene", xref_dbi) + self.biomart_fix("MGI", "Transcript", "Gene", xref_dbi) + if dbname == "ZFIN_ID": + self.biomart_fix("ZFIN_ID", "Translation", "Gene", xref_dbi) + self.biomart_fix("ZFIN_ID", "Transcript", "Gene", xref_dbi) + if dbname == "RGD": + self.biomart_fix("RGD", "Translation", "Gene", xref_dbi) + self.biomart_fix("RGD", "Transcript", "Gene", xref_dbi) + + # Get the current max values for xref and object_xref + max_xref_id = xref_dbi.execute(select(func.max(XrefUORM.xref_id))).scalar() + max_xref_id = int(max_xref_id) + max_object_xref_id = xref_dbi.execute( + select(func.max(ObjectXrefUORM.object_xref_id)) + ).scalar() + max_object_xref_id = int(max_object_xref_id) + + # Get labels, descriptions, and synonyms + display_label_to_desc = self.get_display_label_data(dbname, xref_dbi) + synonyms = self.get_synonyms(dbname, xref_dbi) + + # Get source IDs + dbname_to_source_id = self.get_dbname_to_source_id(dbname, xref_id) + + # Reset gene and transcript stable id display data + self.reset_display_xrefs(xref_dbi) + + # Get the gene and transcript stable IDs and internal IDs + gene_to_transcripts, gene_id_to_stable_id, tran_id_to_stable_id = {}, {}, {} + sorted_gene_ids = [] + + query = ( + select( + GeneTranscriptTranslationORM.gene_id, + GeneTranscriptTranslationORM.transcript_id, + GeneStableIdORM.stable_id.label("gene_stable_id"), + TranscriptStableIdORM.stable_id.label("transcript_stable_id"), + ) + .where( + GeneTranscriptTranslationORM.gene_id == GeneStableIdORM.internal_id, + GeneTranscriptTranslationORM.transcript_id + == TranscriptStableIdORM.internal_id, + ) + .order_by(GeneStableIdORM.stable_id, TranscriptStableIdORM.stable_id) + ) + for row in xref_dbi.execute(query).mappings().all(): + if not gene_to_transcripts.get(row.gene_id): + sorted_gene_ids.append(row.gene_id) + + gene_to_transcripts.setdefault(row.gene_id, []).append(row.transcript_id) + gene_id_to_stable_id[row.gene_id] = row.gene_stable_id + tran_id_to_stable_id[row.transcript_id] = row.transcript_stable_id + + # Get the object xref IDs that we should ignore (EntrezGene xref dependent on RefSeq_predicted xrefs) + ignore_object = {} + + MasterXref = aliased(XrefUORM) + DependentXref = aliased(XrefUORM) + + MasterSource = aliased(SourceUORM) + DependentSource = aliased(SourceUORM) + + query = select(ObjectXrefUORM.object_xref_id.distinct()).where( + ObjectXrefUORM.xref_id == DependentXrefUORM.dependent_xref_id, + DependentXrefUORM.dependent_xref_id == DependentXref.xref_id, + DependentXrefUORM.master_xref_id == MasterXref.xref_id, + MasterXref.source_id == 
MasterSource.source_id, + DependentXref.source_id == DependentSource.source_id, + MasterSource.name.like("Refseq%predicted"), + DependentSource.name.like("EntrezGene"), + ObjectXrefUORM.ox_status == "DUMP_OUT", + ) + for row in xref_dbi.execute(query).mappings().all(): + ignore_object[row.object_xref_id] = 1 + + xref_added, seen_gene, official_name_used = {}, {}, {} + + # Go through all genes + for gene_id in sorted_gene_ids: + transcript_source = dbname + gene_symbol, gene_symbol_xref_id, is_lrg = None, None, 0 + + # Get offical name if it has one + gene_symbol, gene_symbol_xref_id = self.get_official_domain_name( + { + "gene_id": gene_id, + "gene_id_to_stable_id": gene_id_to_stable_id, + "official_name_used": official_name_used, + "dbname": dbname, + "verbose": verbose, + }, + xref_dbi, + ) + + if gene_symbol_xref_id: + official_name_used[gene_symbol_xref_id] = 1 + + # If not found see if there is an LRG entry + if not gene_symbol: + gene_symbol, gene_symbol_xref_id, is_lrg = self.find_lrg_hgnc( + gene_id, xref_dbi + ) + + # If not found look for other valid database sources (RFAM and miRBase, EntrezGene) + if not gene_symbol: + ( + gene_symbol, + gene_symbol_xref_id, + transcript_source, + display_label_to_desc, + ) = self.find_from_other_sources( + ignore_object, + { + "gene_id": gene_id, + "display_label_to_desc": display_label_to_desc, + "transcript_source": transcript_source, + }, + xref_dbi, + ) + + if gene_symbol: + description = display_label_to_desc.get(gene_symbol) + xref_dbi.execute( + update(GeneStableIdORM) + .where(GeneStableIdORM.internal_id == gene_id) + .values(display_xref_id=gene_symbol_xref_id) + ) + + if not is_lrg: + # Set transcript names + max_xref_id, max_object_xref_id, xref_added, seen_gene = ( + self.set_transcript_display_xrefs( + { + "max_xref_id": max_xref_id, + "max_object_xref_id": max_object_xref_id, + "gene_id": gene_id, + "gene_id_to_stable_id": gene_id_to_stable_id, + "gene_symbol": gene_symbol, + "description": description, + "source_id": dbname_to_source_id.get( + f"{transcript_source}_trans_name" + ), + "xref_added": xref_added, + "seen_gene": seen_gene, + "transcript_ids": gene_to_transcripts.get(gene_id, []), + "transcript_source": transcript_source, + "species_id": species_id, + }, + xref_dbi, + ) + ) + + xref_dbi.close() + + self.update_process_status("official_naming_done") + + def get_display_label_data(self, dbname: str, dbi: Connection) -> Dict[str, str]: + label_to_desc = {} + + # Connect synonyms to xref descriptions + query = select(SynonymORM.synonym, XrefUORM.description).where( + XrefUORM.xref_id == SynonymORM.xref_id, + SourceUORM.source_id == XrefUORM.source_id, + SourceUORM.name.like(dbname), + ) + for row in dbi.execute(query).mappings().all(): + label_to_desc[row.synonym] = row.description + + # Connect display labels to xref descriptions + no_descriptions = 0 + query = select(XrefUORM.label, XrefUORM.description).where( + XrefUORM.source_id == SourceUORM.source_id, SourceUORM.name.like(dbname) + ) + for row in dbi.execute(query).mappings().all(): + if not row.description: + no_descriptions += 1 + else: + label_to_desc[row.label] = row.description + + if no_descriptions: + logging.warn(f"Descriptions not defined for {no_descriptions} labels") + + return label_to_desc + + def get_synonyms(self, dbname: str, dbi: Connection) -> Dict[str, str]: + synonyms = {} + + # Connect synonyms with xref labels + query = select(SynonymORM.synonym, XrefUORM.label).where( + XrefUORM.xref_id == SynonymORM.xref_id, + SourceUORM.source_id == 
XrefUORM.source_id, + SourceUORM.name.like(dbname), + ) + for row in dbi.execute(query).mappings().all(): + synonyms[row.synonym] = row.label + + return synonyms + + def get_dbname_to_source_id(self, dbname: str, dbi: Connection) -> Dict[str, int]: + dbname_to_source_id = {} + + sources_list = [ + "RFAM_trans_name", + "miRBase_trans_name", + "EntrezGene_trans_name", + ] + sources_list.append(f"{dbname}_trans_name") + sources_list.append(dbname) + + source_error = 0 + for source_name in sources_list: + source_id = dbi.execute( + select(SourceUORM.source_id).where(SourceUORM.name.like(source_name)) + ).scalar() + + if not source_id: + logging.warn(f"Could not find external database '{source_name}'") + source_error += 1 + else: + dbname_to_source_id[source_name] = source_id + + if source_error: + raise LookupError( + f"Could not find name for {source_error} databases. Therefore Exiting. Please add these sources" + ) + + return dbname_to_source_id + + def reset_display_xrefs(self, dbi: Connection) -> None: + dbi.execute(update(TranscriptStableIdORM).values(display_xref_id=None)) + + dbi.execute(update(GeneStableIdORM).values(display_xref_id=None, desc_set=0)) + + def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tuple[str, int]: + gene_id = args["gene_id"] + gene_id_to_stable_id = args["gene_id_to_stable_id"] + official_name_used = args["official_name_used"] + dbname = args["dbname"] + verbose = args["verbose"] + + gene_symbol, gene_symbol_xref_id = None, None + display_names, xref_id_to_display = {}, {} + best_level, name_count = 999, 0 + xref_ids_list, object_xref_ids_list = [], [] + + # Get the display labels mapped to the gene ID, and extract the ones with the highest priority + query = select( + XrefUORM.label, + XrefUORM.xref_id, + ObjectXrefUORM.object_xref_id, + SourceUORM.priority, + ).where( + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name == dbname, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ) + for row in dbi.execute(query).mappings().all(): + xref_ids_list.append(row.xref_id) + object_xref_ids_list.append(row.object_xref_id) + xref_id_to_display[row.xref_id] = row.label + + name_count += 1 + + if row.priority < best_level: + display_names.clear() + display_names[row.xref_id] = 1 + best_level = row.priority + elif row.priority == best_level: + display_names[row.xref_id] = 1 + + # Check if the best names has been found, and remove the others if so + if name_count > 1 and len(display_names) == 1: + if verbose: + logging.info( + f"For gene {gene_id_to_stable_id[gene_id]}, we have multiple {dbname} names" + ) + + gene_symbol, gene_symbol_xref_id = self.set_the_best_display_name( + display_names, + xref_ids_list, + object_xref_ids_list, + xref_id_to_display, + verbose, + dbi, + ) + if gene_symbol: + return gene_symbol, gene_symbol_xref_id + + # Perfect case, one best name found + if len(display_names) == 1: + xref_id = display_names.keys()[0] + return xref_id_to_display[xref_id], xref_id + + # Try to find the best names out of multiple ones + if len(display_names) > 1: + temp_best_identity = 0 + best_ids, best_list = [], [] + + # Fail xrefs with worse % identity if we can (query or target identity whichever is greater) + case_stmt = case( + [ + ( + IdentityXrefUORM.query_identity + >= IdentityXrefUORM.target_identity, + IdentityXrefUORM.query_identity, + ) + ], + else_=IdentityXrefUORM.target_identity, + 
).label("best_identity") + query = ( + select(XrefUORM.xref_id, case_stmt) + .where( + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id, + SourceUORM.name == dbname, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ) + .order_by(desc("best_identity")) + ) + for row in dbi.execute(query).mappings().all(): + if row.best_identity > temp_best_identity: + best_ids.clear() + best_ids[row.xref_id] = 1 + temp_best_identity = row.best_identity + elif row.best_identity == temp_best_identity: + best_ids[row.xref_id] = 1 + else: + break + + for xref_id in display_names.keys(): + best_list[xref_id_to_display[xref_id]] = 1 + + # Check if we were able to reduce the number of xrefs based on % identity + if len(best_ids) > 0 and len(best_ids) < len(display_names): + display_names = best_ids + if verbose: + logging.info( + f"For gene {gene_id_to_stable_id[gene_id]}, we have multiple {dbname} names" + ) + + gene_symbol, gene_symbol_xref_id = self.set_the_best_display_name( + display_names, + xref_ids_list, + object_xref_ids_list, + xref_id_to_display, + verbose, + dbi, + ) + if gene_symbol and len(display_names) == 1: + return gene_symbol, gene_symbol_xref_id + + # Take the name which hasn't been already assigned to another gene, if possible + xref_not_used = None + for xref_id in display_names.keys(): + if not official_name_used.get(xref_id): + xref_not_used = xref_id + + if xref_not_used: + if verbose: + logging.info(f"For gene {gene_id_to_stable_id[gene_id]}:") + for xref_id in display_names.keys(): + if xref_id == xref_not_used: + if verbose: + logging.info(f"\t{xref_id_to_display[xref_id]} chosen") + gene_symbol = xref_id_to_display[xref_id] + gene_symbol_xref_id = xref_id + else: + if verbose: + logging.info( + f"\t{xref_id_to_display[xref_id]} (left as {dbname} reference but not gene symbol)" + ) + else: + index = 0 + for xref_id in display_names.keys(): + if not index: + if verbose: + logging.info( + f"\t{xref_id_to_display[xref_id]} chosen as first" + ) + gene_symbol = xref_id_to_display[xref_id] + gene_symbol_xref_id = xref_id + else: + if verbose: + logging.info( + f"\t{xref_id_to_display[xref_id]} (left as {dbname} reference but not gene symbol)" + ) + index += 1 + + return gene_symbol, gene_symbol_xref_id + + def set_the_best_display_name(self, display_names: Dict[int, int], xref_list: List[int], object_xref_list: List[int], xref_id_to_display: Dict[int, str], verbose: bool, dbi: Connection) -> Tuple[str, int]: + gene_symbol, gene_symbol_xref_id = None, None + + for xref_id in xref_list: + # Remove object xrefs that are not in the best display names list + if not display_names.get(xref_id): + if verbose: + logging.info(f"Removing {xref_id_to_display[xref_id]} from gene") + self.update_object_xref_status( + object_xref_list[xref_id], "MULTI_DELETE", dbi + ) + else: + if verbose: + logging.info(f"Keeping the best one {xref_id_to_display[xref_id]}") + gene_symbol = xref_id_to_display[xref_id] + gene_symbol_xref_id = xref_id + + return gene_symbol, gene_symbol_xref_id + + def find_lrg_hgnc(self, gene_id: int, dbi: Connection) -> Tuple[str, int, bool]: + gene_symbol, gene_symbol_xref_id = None, None + is_lrg = False + + # Look for LRG_HGNC_notransfer, if found then find HGNC equiv and set to this + query = select( + XrefUORM.label, + XrefUORM.xref_id, + ObjectXrefUORM.object_xref_id, + SourceUORM.priority, + 
).where( + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name == "LRG_HGNC_notransfer", + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ) + for row in dbi.execute(query).mappings().all(): + # Set status to NO_DISPLAY as we do not want this transferred, just the equivalent hgnc + self.update_object_xref_status(row.object_xref_id, "NO_DISPLAY") + + new_xref_id, priority = None, None + query = ( + select(XrefUORM.xref_id, SourceUORM.priority) + .where( + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + XrefUORM.label == row.label, + SourceUORM.name == "HGNC", + ObjectXrefUORM.ox_status == "DUMP_OUT", + ) + .order_by(SourceUORM.priority) + ) + result = dbi.execute(query).fetchall() + if result: + new_xref_id, priority = result[0] + + if new_xref_id: + gene_symbol = row.label + gene_symbol_xref_id = new_xref_id + is_lrg = True + + return gene_symbol, gene_symbol_xref_id, is_lrg + + def find_from_other_sources(self, ignore: Dict[int, int], args: Dict[str, Any], dbi: Connection) -> Tuple[str, int, str, Dict[str, str]]: + gene_id = args["gene_id"] + display_label_to_desc = args["display_label_to_desc"] + transcript_source = args["transcript_source"] + + gene_symbol, gene_symbol_xref_id = None, None + other_name_number, found_gene = {}, {} + + for dbname in ["miRBase", "RFAM", "EntrezGene"]: + query = select( + XrefUORM.label, + XrefUORM.xref_id, + ObjectXrefUORM.object_xref_id, + XrefUORM.description, + ).where( + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name == dbname, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ) + for row in dbi.execute(query).mappings().all(): + if found_gene.get(gene_id): + break + if re.search(r"^LOC", row.label) or re.search(r"^SSC", row.label): + continue + if ignore.get(row.object_xref_id): + continue + + gene_symbol = row.label + gene_symbol_xref_id = row.xref_id + transcript_source = dbname + display_label_to_desc[row.label] = row.description + + if other_name_number.get(gene_symbol): + other_name_number[gene_symbol] += 1 + else: + other_name_number[gene_symbol] = 1 + + if dbname != "EntrezGene": + gene_symbol = f"{gene_symbol}.{other_name_number[gene_symbol]}" + + found_gene[gene_id] = 1 + + return gene_symbol, gene_symbol_xref_id, transcript_source, display_label_to_desc + + def set_transcript_display_xrefs(self, args: Dict[str, Any], dbi: Connection) -> Tuple[int, int, Dict[str, int], Dict[str, int]]: + max_xref_id = args["max_xref_id"] + max_object_xref_id = args["max_object_xref_id"] + gene_id = args["gene_id"] + gene_id_to_stable_id = args["gene_id_to_stable_id"] + gene_symbol = args["gene_symbol"] + description = args["description"] + source_id = args["source_id"] + xref_added = args["xref_added"] + seen_gene = args["seen_gene"] + transcript_ids = args["transcript_ids"] + transcript_source = args["transcript_source"] + species_id = args["species_id"] + + # Do nothing is LRG + if re.search("LRG", gene_id_to_stable_id.get(gene_id)): + return + + ext = 201 + if seen_gene.get(gene_symbol): + ext = seen_gene[gene_symbol] + + # Go thourgh transcripts + for transcript_id in transcript_ids: + transcript_name = f"{gene_symbol}-{ext}" + + if not source_id: + raise LookupError( + f"transcript_name = {transcript_name} for transcript_id {transcript_id} but NO source_id for this entry for 
{transcript_source}???" + ) + + index = f"{transcript_name}:{source_id}" + if not xref_added.get(index): + # Add new xref for the transcript name + max_xref_id += 1 + dbi.execute( + insert(XrefUORM) + .values( + xref_id=max_xref_id, + source_id=source_id, + accession=transcript_name, + label=transcript_name, + version=0, + species_id=species_id, + info_type="MISC", + info_text="", + description=description, + ) + .prefix_with("IGNORE") + ) + + xref_added[index] = max_xref_id + + # Update the transcript display xref + dbi.execute( + update(TranscriptStableIdORM) + .where(TranscriptStableIdORM.internal_id == transcript_id) + .values(display_xref_id=xref_added[index]) + ) + + # Add a corresponding object and identity xrefs + max_object_xref_id += 1 + dbi.execute( + insert(ObjectXrefUORM).values( + object_xref_id=max_object_xref_id, + ensembl_id=transcript_id, + ensembl_object_type="Transcript", + xref_id=xref_added[index], + linkage_type="MISC", + ox_status="DUMP_OUT", + ) + ) + + dbi.execute( + insert(IdentityXrefUORM).values( + object_xref_id=max_object_xref_id, + query_identity=100, + target_identity=100, + ) + ) + + ext += 1 + + seen_gene[gene_symbol] = ext + + return max_xref_id, max_object_xref_id, xref_added, seen_gene diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessMappings.py b/src/python/ensembl/production/xrefs/mappers/ProcessMappings.py new file mode 100644 index 000000000..53832520c --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/ProcessMappings.py @@ -0,0 +1,382 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
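The transcript naming performed by set_transcript_display_xrefs above reduces to a per-symbol counter: every transcript of a gene receives the chosen gene symbol plus a numeric suffix starting at 201, and the next free suffix is remembered per symbol in seen_gene so it is not reused. A minimal sketch of just that numbering logic (the gene symbol and internal IDs are invented for illustration):

def name_transcripts(gene_symbol: str, transcript_ids: list, seen_gene: dict) -> dict:
    # Start at 201, or continue from where this symbol left off
    ext = seen_gene.get(gene_symbol, 201)
    names = {}
    for transcript_id in transcript_ids:
        names[transcript_id] = f"{gene_symbol}-{ext}"
        ext += 1
    seen_gene[gene_symbol] = ext
    return names

seen_gene = {}
print(name_transcripts("BRCA2", [1101, 1102, 1103], seen_gene))
# {1101: 'BRCA2-201', 1102: 'BRCA2-202', 1103: 'BRCA2-203'}
print(name_transcripts("BRCA2", [2204], seen_gene))
# {2204: 'BRCA2-204'}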
+ +"""Mapper module for processing sequence matched xref data.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class ProcessMappings(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def process_mappings(self) -> None: + xref_dbi = self.xref().connect() + + query_cutoff, target_cutoff = {}, {} + + # Get cutoffs per mapping job + mapping_query = select( + MappingORM.job_id, + MappingORM.percent_query_cutoff, + MappingORM.percent_target_cutoff, + ) + for mapping in xref_dbi.execute(mapping_query).mappings().all(): + query_cutoff[mapping.job_id] = mapping.percent_query_cutoff + target_cutoff[mapping.job_id] = mapping.percent_target_cutoff + + already_processed_count, processed_count, error_count, empty_count = 0, 0, 0, 0 + + # Go through mapping jobs + mapping_query = select( + MappingJobsORM.root_dir, + MappingJobsORM.map_file, + MappingJobsORM.status, + MappingJobsORM.out_file, + MappingJobsORM.err_file, + MappingJobsORM.array_number, + MappingJobsORM.job_id, + ) + for mapping_job in xref_dbi.execute(mapping_query).mappings().all(): + root_dir = mapping_job.root_dir + if root_dir is None: + root_dir = "" + + err_file = os.path.join(root_dir, mapping_job.err_file) + out_file = os.path.join(root_dir, mapping_job.out_file) + map_file = os.path.join(root_dir, mapping_job.map_file) + + update_status = None + + if mapping_job.status == "SUCCESS": + already_processed_count += 1 + else: + if os.path.exists(err_file) and os.path.getsize(err_file) > 0: + error_count += 1 + + # Display errors on STDERR + logging.warning(f"Problem {err_file} is non zero") + try: + with open(err_file) as fh: + for line in fh: + logging.warning(f"#{line}") + except: + logging.debug( + f"No error file exists {err_file}???\n Resubmit this job" + ) + + if mapping_job.status == "SUBMITTED": + update_status = "FAILED" + else: + # Process the mapping file + if os.path.exists(map_file): + count = self.process_map_file( + map_file, + query_cutoff[mapping_job.job_id], + target_cutoff[mapping_job.job_id], + mapping_job.job_id, + mapping_job.array_number, + xref_dbi, + ) + if count > 0: + processed_count += 1 + update_status = "SUCCESS" + elif count == 0: + processed_count += 1 + empty_count += 1 + update_status = "SUCCESS" + else: + error_count += 1 + update_status = "FAILED" + else: + error_count += 1 + logging.debug( + f"Could not open map file {map_file}???\n Resubmit this job" + ) + update_status = "FAILED" + + # Update mapping job status + if update_status: + xref_dbi.execute( + update(MappingJobsORM) + .where( + MappingJobsORM.job_id == mapping_job.job_id, + MappingJobsORM.array_number == mapping_job.array_number, + ) + .values(status=update_status) + ) + + logging.info( + f"Already processed = {already_processed_count}, processed = {processed_count}, errors = {error_count}, empty = {empty_count}" + ) + + xref_dbi.close() + + if not error_count: + self.update_process_status("mapping_processed") + + def process_map_file(self, map_file: str, query_cutoff: int, target_cutoff: int, job_id: int, array_number: int, dbi: Connection) -> int: + ensembl_type = "Translation" + if re.search("dna_", map_file): + ensembl_type = "Transcript" + + # Get max object xref id + object_xref_id = dbi.execute( + select(func.max(ObjectXrefUORM.object_xref_id)) + ).scalar() + if not object_xref_id: + object_xref_id = 0 + + total_lines, last_query_id = 0, 0 + best_match_found, best_identity, best_score = 0, 0, 0 + first = 1 + + 
mRNA_biotypes = { + "protein_coding": 1, + "TR_C_gene": 1, + "IG_V_gene": 1, + "nonsense_mediated_decay": 1, + "polymorphic_pseudogene": 1, + } + + try: + mh = open(map_file) + except: + logging.debug(f"Could not open map file {map_file}\n Resubmit this job") + return -1 + + for line in mh: + load_object_xref = 0 + total_lines += 1 + + ( + label, + query_id, + target_id, + identity, + query_length, + target_length, + query_start, + query_end, + target_start, + target_end, + cigar_line, + score, + ) = line.strip().split(":") + + # Fix varibale types (for integer comparisons) + identity = int(identity) + score = int(score) + query_length = int(query_length) + target_length = int(target_length) + query_start = int(query_start) + target_start = int(target_start) + + if last_query_id != query_id: + best_match_found = 0 + best_score = 0 + best_identity = 0 + else: + # Ignore mappings with worse identity or score if we already found a good mapping + if ( + identity < best_identity or score < best_score + ) and best_match_found: + continue + + if ensembl_type == "Translation": + load_object_xref = 1 + else: + # Check if source name is RefSeq_ncRNA or RefSeq_mRNA + # If yes check biotype, if ok store object xref + source_name = dbi.execute( + select(SourceUORM.name) + .join(XrefUORM, XrefUORM.source_id == SourceUORM.source_id) + .where(XrefUORM.xref_id == query_id) + ).scalar() + + if source_name and ( + re.search(r"^RefSeq_(m|nc)RNA", source_name) + or re.search(r"^miRBase", source_name) + or re.search(r"^RFAM", source_name) + ): + # Make sure mRNA xrefs are matched to protein_coding biotype only + biotype = dbi.execute( + select(TranscriptStableIdORM.biotype).where( + TranscriptStableIdORM.internal_id == target_id + ) + ).scalar() + + if re.search(r"^RefSeq_mRNA", source_name) and mRNA_biotypes.get( + biotype + ): + load_object_xref = 1 + if re.search( + r"^RefSeq_ncRNA", source_name + ) and not mRNA_biotypes.get(biotype): + load_object_xref = 1 + if ( + re.search(r"^miRBase", source_name) + or re.search(r"^RFAM", source_name) + ) and re.search("RNA", biotype): + load_object_xref = 1 + else: + load_object_xref = 1 + + last_query_id = query_id + + # Check if found a better match + if score > best_score or identity > best_identity: + best_score = score + best_identity = identity + + if not load_object_xref: + continue + else: + best_match_found = 1 + + if not score: + self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) + raise ValueError(f"No score on line. 
Possible file corruption\n{line}") + + # Calculate percentage identities + query_identity = int(100 * identity / query_length) + target_identity = int(100 * identity / target_length) + + # Only keep alignments where both sequences match cutoff + status = "DUMP_OUT" + if query_identity < query_cutoff or target_identity < target_cutoff: + status = "FAILED_CUTOFF" + + # Add object xref row + object_xref_id = self.get_object_xref_id( + target_id, query_id, ensembl_type, "SEQUENCE_MATCH", dbi, None, status + ) + if object_xref_id: + continue + else: + try: + object_xref_id = self.add_object_xref( + target_id, + query_id, + ensembl_type, + "SEQUENCE_MATCH", + dbi, + None, + status, + ) + except: + self.update_object_xref_end( + job_id, array_number, object_xref_id, dbi + ) + raise IOError(f"Problem adding object_xref row") + + if first: + self.update_object_xref_start(job_id, array_number, object_xref_id, dbi) + first = 0 + + cigar_line = re.sub(" ", "", cigar_line) + cigar_line = re.sub(r"([MDI])(\d+)", r"\2\1", cigar_line) + + # Add identity xref row + try: + identity_xref_query = insert(IdentityXrefUORM).values( + object_xref_id=object_xref_id, + query_identity=query_identity, + target_identity=target_identity, + hit_start=query_start + 1, + hit_end=query_end, + translation_start=target_start + 1, + translation_end=target_end, + cigar_line=cigar_line, + score=score, + ) + dbi.execute(identity_xref_query) + except: + self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) + raise IOError(f"Problem loading identity_xref") + + master_xref_ids = [query_id] + for master_xref_id in master_xref_ids: + # Get all dependents related to master xref + dep_query = select(DependentXrefUORM.dependent_xref_id).where( + DependentXrefUORM.master_xref_id == master_xref_id + ) + for dep in dbi.execute(dep_query).mappings().all(): + # Add dependent object xref + dep_object_xref_id = self.get_object_xref_id( + target_id, + dep.dependent_xref_id, + ensembl_type, + "DEPENDENT", + dbi, + master_xref_id, + status, + ) + if dep_object_xref_id: + continue + else: + try: + dep_object_xref_id = self.add_object_xref( + target_id, + dep.dependent_xref_id, + ensembl_type, + "DEPENDENT", + dbi, + master_xref_id, + status, + ) + except: + self.update_object_xref_end( + job_id, array_number, object_xref_id, dbi + ) + raise IOError(f"Problem adding dependent object xref row") + + # Add dependent identity xref + dbi.execute( + insert(IdentityXrefUORM).values( + object_xref_id=dep_object_xref_id, + query_identity=query_identity, + target_identity=target_identity, + ) + ) + + # Get the dependent dependents just in case + master_xref_ids.append(dep.dependent_xref_id) + + mh.close() + + self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) + return total_lines + + def update_object_xref_end(self, job_id: int, array_number: int, object_xref_id: int, dbi: Connection) -> None: + dbi.execute( + update(MappingJobsORM) + .where( + MappingJobsORM.job_id == job_id, + MappingJobsORM.array_number == array_number, + ) + .values(object_xref_end=object_xref_id) + ) + + def update_object_xref_start(self, job_id: int, array_number: int, object_xref_id: int, dbi: Connection) -> None: + dbi.execute( + update(MappingJobsORM) + .where( + MappingJobsORM.job_id == job_id, + MappingJobsORM.array_number == array_number, + ) + .values(object_xref_start=object_xref_id) + ) diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessMoves.py b/src/python/ensembl/production/xrefs/mappers/ProcessMoves.py new file mode 100644 
index 000000000..c086cab01 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/ProcessMoves.py @@ -0,0 +1,478 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for moving xref data onto appriopriate genes.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class ProcessMoves(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def biomart_testing(self, verbose: bool) -> None: + logging.info("Starting biomart testing") + + xref_dbi = self.xref().connect() + + again = 1 + while again: + again = 0 + + last_type, last_count, last_name = None, None, "DEFAULT" + + query = ( + select( + ObjectXrefUORM.ensembl_object_type, + SourceUORM.name, + func.count(ObjectXrefUORM.object_xref_id).label("count"), + ) + .where( + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + SourceUORM.source_id == XrefUORM.source_id, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ) + .group_by(SourceUORM.name, ObjectXrefUORM.ensembl_object_type) + ) + for row in xref_dbi.execute(query).mappings().all(): + if again: + break + + if last_name == row.name: + again = 1 + self.biomart_fix( + row.name, last_type, row.ensembl_object_type, xref_dbi + ) + + last_name = row.name + last_type = row.ensembl_object_type + last_count = row.count + + if self.unlinked_entries(verbose, xref_dbi): + raise ValueError("Problems found before source_defined_move") + + xref_dbi.close() + + self.update_process_status("biomart_test_finished") + + def unlinked_entries(self, verbose: bool, dbi: Connection) -> bool: + failed = False + xref_id, count = None, None + + self.update_process_status("tests_started") + + # Get count of unlinked master xrefs + count = dbi.execute( + select(func.count(DependentXrefUORM.master_xref_id)) + .outerjoin(XrefUORM, XrefUORM.xref_id == DependentXrefUORM.master_xref_id) + .where(XrefUORM.xref_id == None) + ).scalar() + + if count: + failed = True + logging.error(f"Problem with {count} master xrefs") + + if verbose: + query = ( + select(DependentXrefUORM.master_xref_id.distinct()) + .outerjoin( + XrefUORM, XrefUORM.xref_id == DependentXrefUORM.master_xref_id + ) + .where(XrefUORM.xref_id == None) + .limit(10) + ) + for row in dbi.execute(query).mappings().all(): + logging.error(f"Problem with master xref {row.master_xref_id}") + + # Get count of unlinked dependent xrefs + count = dbi.execute( + select(func.count(DependentXrefUORM.dependent_xref_id)) + .outerjoin( + XrefUORM, XrefUORM.xref_id == DependentXrefUORM.dependent_xref_id + ) + .where(XrefUORM.xref_id == None) + ).scalar() + + if count: + failed = True + logging.error(f"Problem with {count} dependent xrefs") + + if verbose: + query = ( + select(DependentXrefUORM.dependent_xref_id.distinct()) + .outerjoin( + XrefUORM, + XrefUORM.xref_id == DependentXrefUORM.dependent_xref_id, + ) + .where(XrefUORM.xref_id == None) + 
.limit(10) + ) + for row in dbi.execute(query).mappings().all(): + logging.error( + f"Problem with dependent xref {row.dependent_xref_id}" + ) + + # Get count of unlinked primary xrefs + count = dbi.execute( + select(func.count(PrimaryXrefORM.xref_id)) + .outerjoin(XrefUORM, XrefUORM.xref_id == PrimaryXrefORM.xref_id) + .where(XrefUORM.xref_id == None) + ).scalar() + + if count: + failed = True + logging.error(f"Problem with {count} primary xrefs") + + if verbose: + query = ( + select(PrimaryXrefORM.xref_id.distinct()) + .outerjoin(XrefUORM, XrefUORM.xref_id == PrimaryXrefORM.xref_id) + .where(XrefUORM.xref_id == None) + .limit(10) + ) + for row in dbi.execute(query).mappings().all(): + logging.error(f"Problem with primary xref {row.xref_id}") + + db_tables = { + "gene": {"direct": GeneDirectXrefORM, "stable_id": GeneStableIdORM}, + "transcript": { + "direct": TranscriptDirectXrefORM, + "stable_id": TranscriptStableIdORM, + }, + "translation": { + "direct": TranslationDirectXrefORM, + "stable_id": TranslationStableIdORM, + }, + } + + # Get count of unlinked direct xrefs + for object_type in ["transcript", "translation", "gene"]: + direct_table = db_tables[object_type]["direct"] + count = dbi.execute( + select(func.count(direct_table.general_xref_id)) + .outerjoin(XrefUORM, XrefUORM.xref_id == direct_table.general_xref_id) + .where(XrefUORM.xref_id == None) + ).scalar() + + if count: + failed = True + logging.error(f"Problem with {count} {object_type} direct xrefs") + + if verbose: + query = ( + select(direct_table.general_xref_id.distinct()) + .outerjoin( + XrefUORM, XrefUORM.xref_id == direct_table.general_xref_id + ) + .where(XrefUORM.xref_id == None) + .limit(10) + ) + for row in dbi.execute(query).mappings().all(): + logging.error( + f"Problem with {object_type} direct xref {row.general_xref_id}" + ) + + # Get count of unlinked synonyms + count = dbi.execute( + select(func.count(SynonymORM.xref_id)) + .outerjoin(XrefUORM, XrefUORM.xref_id == SynonymORM.xref_id) + .where(XrefUORM.xref_id == None) + ).scalar() + + if count: + failed = True + logging.error(f"Problem with {count} synonyms") + + if verbose: + query = ( + select(SynonymORM.xref_id.distinct()) + .outerjoin(XrefUORM, XrefUORM.xref_id == SynonymORM.xref_id) + .where(XrefUORM.xref_id == None) + .limit(10) + ) + for row in dbi.execute(query).mappings().all(): + logging.error(f"Problem with synonym {row.xref_id}") + + # Get count of unlinked identity object xrefs + count = dbi.execute( + select(func.count(IdentityXrefUORM.object_xref_id)) + .outerjoin( + ObjectXrefUORM, + ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id, + ) + .where(ObjectXrefUORM.object_xref_id == None) + ).scalar() + + if count: + failed = True + logging.error(f"Problem with {count} object xrefs") + + if verbose: + query = ( + select(IdentityXrefUORM.object_xref_id.distinct()) + .outerjoin( + ObjectXrefUORM, + ObjectXrefUORM.object_xref_id + == IdentityXrefUORM.object_xref_id, + ) + .where(ObjectXrefUORM.object_xref_id == None) + .limit(10) + ) + for row in dbi.execute(query).mappings().all(): + logging.error(f"Problem with object xref {row.object_xref_id}") + + # Get count of unlinked objects + for object_type in ["transcript", "translation", "gene"]: + id_column = getattr(GeneTranscriptTranslationORM, f"{object_type}_id") + stable_id_table = db_tables[object_type]["stable_id"] + + count = dbi.execute( + select(func.count(id_column)) + .outerjoin(stable_id_table, stable_id_table.internal_id == id_column) + .where(stable_id_table.internal_id == 
None, id_column != None) + ).scalar() + + if count: + failed = True + logging.error(f"Problem with {count} {object_type}_ids") + + if verbose: + query = ( + select(id_column.label("object_id").distinct()) + .outerjoin( + stable_id_table, stable_id_table.internal_id == id_column + ) + .where(stable_id_table.internal_id == None, id_column != None) + .limit(10) + ) + for row in dbi.execute(query).mappings().all(): + logging.error(f"Problem with {object_type}_id {row.object_id}") + + if not failed: + self.update_process_status("tests_finished") + else: + self.update_process_status("tests_failed") + + return failed + + def source_defined_move(self, verbose: bool) -> None: + xref_dbi = self.xref().connect() + + for source in self.get_gene_specific_list(xref_dbi): + self.biomart_fix(source, "Translation", "Gene", xref_dbi) + self.biomart_fix(source, "Transcript", "Gene", xref_dbi) + + if self.unlinked_entries(verbose, xref_dbi): + raise ValueError("Problems found after source_defined_move") + + xref_dbi.close() + + self.update_process_status("source_level_move_finished") + + def get_gene_specific_list(self, dbi: Connection) -> List[str]: + sources_list = [ + "DBASS3", + "DBASS5", + "EntrezGene", + "miRBase", + "RFAM", + "TRNASCAN_SE", + "RNAMMER", + "UniGene", + "Uniprot_gn", + "WikiGene", + "MIM_GENE", + "MIM_MORBID", + "HGNC", + "MGI", + "ZFIN_ID", + "FlyBaseName_gene", + "RGD", + "SGD_GENE", + "VGNC", + "wormbase_gseqname", + "wormbase_locus", + "Xenbase", + "GeneCards", + ] + + used_list = [] + count = None + + # Check that the sources are used in the database considered + for source in sources_list: + count = dbi.execute( + select(func.count(XrefUORM.xref_id)).where( + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name == source, + ) + ).scalar() + + if count > 0: + used_list.append(source) + + return used_list + + def process_alt_alleles(self, verbose: bool) -> None: + logging.info("Processing alt alleles") + + xref_dbi = self.xref().connect() + + alt_to_ref, ref_to_alts = self.get_alt_allele_hashes(xref_dbi) + gene_specific_list = self.get_gene_specific_list(xref_dbi) + + move_count, del_identity_xref_count, del_object_xref_count = 0, 0, 0 + + for gene_id, ref_gene in alt_to_ref.items(): + # Move the xrefs onto the reference Gene + query = ( + update(ObjectXrefUORM) + .where( + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ObjectXrefUORM.ox_status == "DUMP_OUT", + SourceUORM.name.in_(gene_specific_list), + ) + .values(ensembl_id=ref_gene) + .prefix_with("IGNORE") + ) + row_count = xref_dbi.execute(query).rowcount + move_count += row_count + + # Delete the related identity and object xrefs + query = delete(IdentityXrefUORM).where( + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ObjectXrefUORM.ox_status == "DUMP_OUT", + SourceUORM.name.in_(gene_specific_list), + ) + row_count = xref_dbi.execute(query).rowcount + del_identity_xref_count += row_count + + query = delete(ObjectXrefUORM).where( + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ObjectXrefUORM.ox_status == "DUMP_OUT", + SourceUORM.name.in_(gene_specific_list), + 
) + row_count = xref_dbi.execute(query).rowcount + del_object_xref_count += row_count + + logging.info( + f"Number of rows: moved = {move_count}, identity_xrefs deleted = {del_identity_xref_count}, object_xrefs deleted = {del_object_xref_count}" + ) + + max_object_xref_id = xref_dbi.execute( + select(func.max(ObjectXrefUORM.object_xref_id)) + ).scalar() + max_object_xref_id = int(max_object_xref_id) + + if not max_object_xref_id: + raise LookupError("Problem getting max object_xref_id") + + added_count, ignored = 0, 0 + + # Copy the xref data related to the reference gene onto the alt alleles + for ref_gene, alts in ref_to_alts.items(): + # Get object and identity xref data related to the reference gene + query = ( + select(ObjectXrefUORM, IdentityXrefUORM) + .outerjoin( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + .where( + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.ensembl_id == ref_gene, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_object_type == "Gene", + SourceUORM.name.in_(gene_specific_list), + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + for alt in alts: + max_object_xref_id += 1 + + query = insert(ObjectXrefUORM).values( + object_xref_id=max_object_xref_id, + ensembl_id=alt, + ensembl_object_type=row.ensembl_object_type, + xref_id=row.xref_id, + linkage_annotation=row.linkage_annotation, + linkage_type=row.linkage_type, + ox_status=row.ox_status, + unused_priority=row.unused_priority, + master_xref_id=row.master_xref_id, + ) + row_count = xref_dbi.execute(query).rowcount + + # Only add identity xref if object_xref was added successfully + if row_count: + added_count += 1 + + query = insert(IdentityXrefUORM).values( + object_xref_id=max_object_xref_id, + query_identity=row.query_identity, + target_identity=row.target_identity, + hit_start=row.hit_start, + hit_end=row.hit_end, + translation_start=row.translation_start, + translation_end=row.translation_end, + cigar_line=row.cigar_line, + score=row.score, + evalue=row.evalue, + ) + xref_dbi.execute(query) + else: + ignored += 1 + + logging.info(f"Added {added_count} new mappings and ignored {ignored}") + + if self.unlinked_entries(verbose, xref_dbi): + raise ValueError("Problems found after process_alt_alleles") + + xref_dbi.close() + + self.update_process_status("alt_alleles_processed") + + def get_alt_allele_hashes(self, dbi: Connection) -> Tuple[Dict[int, int], Dict[int, List[int]]]: + alt_to_ref, ref_to_alts = {}, {} + last_alt_allele, ref_gene = 0, None + + query = select( + AltAlleleUORM.alt_allele_id, + AltAlleleUORM.gene_id, + AltAlleleUORM.is_reference, + ).order_by(AltAlleleUORM.alt_allele_id, AltAlleleUORM.is_reference.desc()) + for row in dbi.execute(query).mappings().all(): + if row.alt_allele_id != last_alt_allele: + # Use the first non-reference gene if there is no reference gene in an alt_allele + ref_gene = row.gene_id + else: + alt_to_ref[row.gene_id] = ref_gene + ref_to_alts.setdefault(ref_gene, []).append(row.gene_id) + + last_alt_allele = row.alt_allele_id + + return alt_to_ref, ref_to_alts diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessPaired.py b/src/python/ensembl/production/xrefs/mappers/ProcessPaired.py new file mode 100644 index 000000000..0dcbfdff4 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/ProcessPaired.py @@ -0,0 +1,248 @@ +# See the NOTICE file distributed with this work for additional information +# regarding 
copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for processing paired xrefs.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class ProcessPaired(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def process(self) -> None: + logging.info("Processing paired xrefs") + + xref_dbi = self.xref().connect() + + object_xref_id = None + change = { + "translation object xrefs added": 0, + "translation object xrefs removed": 0, + } + RefSeq_pep_translation = {} + + # Get the transcript RefSeq_mRNA% object xrefs, and the paired RefSeq_peptide% accessions as well as the translation id for the transcript + query = ( + select( + ObjectXrefUORM.object_xref_id, + GeneTranscriptTranslationORM.translation_id, + PairsORM.source_id, + PairsORM.accession1, + IdentityXrefUORM.query_identity, + IdentityXrefUORM.target_identity, + ) + .join( + XrefUORM, + (XrefUORM.xref_id == ObjectXrefUORM.xref_id) + & (ObjectXrefUORM.ox_status == "DUMP_OUT"), + ) + .join( + SourceUORM, + (SourceUORM.source_id == XrefUORM.source_id) + & (SourceUORM.name.like("RefSeq_mRNA%")), + ) + .join(PairsORM, PairsORM.accession2 == XrefUORM.accession) + .join( + GeneTranscriptTranslationORM, + GeneTranscriptTranslationORM.transcript_id == ObjectXrefUORM.ensembl_id, + ) + .join( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + # Check if translation is linked to the paired RefSeq peptide + if row.translation_id: + query = ( + select(ObjectXrefUORM.object_xref_id, ObjectXrefUORM.xref_id) + .join(XrefUORM, XrefUORM.xref_id == ObjectXrefUORM.xref_id) + .where( + ObjectXrefUORM.ox_status.in_(["DUMP_OUT", "FAILED_PRIORITY"]), + ObjectXrefUORM.ensembl_object_type == "Translation", + ObjectXrefUORM.ensembl_id == row.translation_id, + XrefUORM.source_id == row.source_id, + XrefUORM.accession == row.accession1, + ) + ) + result = xref_dbi.execute(query) + if result.rowcount > 0: + object_xref_row = result.mappings().all()[0] + transl_object_xref_id = object_xref_row.object_xref_id + else: + transl_object_xref_id = None + + # If it's already linked we don't have to do anything + if not transl_object_xref_id: + # Get the associated xref ID + xref_id = xref_dbi.execute( + select(XrefUORM.xref_id).where( + XrefUORM.accession == row.accession1, + XrefUORM.source_id == row.source_id, + ) + ).scalar() + + if not xref_id: + raise LookupError( + f"Xref not found for accession {row.accession1} source_id {row.source_id}" + ) + + # Add a new object xref + object_xref_id = self.add_object_xref( + row.translation_id, + xref_id, + "Translation", + "INFERRED_PAIR", + xref_dbi, + None, + "DUMP_OUT", + ) + + # Update info type for xref + xref_dbi.execute( + update(XrefUORM) + .where(XrefUORM.xref_id == xref_id) + .values(info_type="INFERRED_PAIR") + ) + + # Also insert into identity_xref if needed + if 
row.query_identity and row.target_identity: + xref_dbi.execute( + insert(IdentityXrefUORM).values( + object_xref_id=object_xref_id, + query_identity=row.query_identity, + target_identity=row.target_identity, + ) + ) + + change["translation object xrefs added"] += 1 + transl_object_xref_id = object_xref_id + + if transl_object_xref_id: + RefSeq_pep_translation.setdefault(row.accession1, []).append( + row.translation_id + ) + + # Go through RefSeq_peptide% object_xrefs + query = ( + select( + ObjectXrefUORM.object_xref_id, + ObjectXrefUORM.ensembl_id, + XrefUORM.accession, + GeneTranscriptTranslationORM.transcript_id, + ) + .join( + ObjectXrefUORM, + ( + ObjectXrefUORM.ensembl_id + == GeneTranscriptTranslationORM.translation_id + ) + & (ObjectXrefUORM.ensembl_object_type == "Translation"), + ) + .join( + XrefUORM, + (XrefUORM.xref_id == ObjectXrefUORM.xref_id) + & (ObjectXrefUORM.ox_status == "DUMP_OUT") + & (ObjectXrefUORM.ensembl_object_type == "Translation"), + ) + .join( + SourceUORM, + (SourceUORM.source_id == XrefUORM.source_id) + & (SourceUORM.name.like("RefSeq_peptide%")), + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + if RefSeq_pep_translation.get(row.accession): + found = 0 + for tr_id in RefSeq_pep_translation[row.accession]: + if tr_id == row.ensembl_id: + found = 1 + + if not found: + # This translations's transcript is not matched with the paired RefSeq_mRNA%, + # change the status to 'MULTI_DELETE' + self.update_object_xref_status( + row.object_xref_id, "MULTI_DELETE", xref_dbi + ) + + # Process all dependent xrefs as well + self.process_dependents( + row.object_xref_id, row.ensembl_id, row.transcript_id, xref_dbi + ) + + change["translation object xrefs removed"] += 1 + + for key, val in change.items(): + logging.info(f"{key}:\t{val}") + + xref_dbi.close() + + self.update_process_status("processed_pairs") + + def process_dependents(self, translation_object_xref_id: int, translation_id: int, transcript_id: int, dbi: Connection) -> None: + master_object_xrefs = [] + new_master_object_xref_id = None + master_object_xref_ids = {} + + master_object_xrefs.append(translation_object_xref_id) + master_object_xref_ids[translation_object_xref_id] = 1 + + while master_object_xrefs: + master_object_xref_id = master_object_xrefs.pop() + dependent_object_xref_id = None + + MasterObjectXref = aliased(ObjectXrefUORM) + DependentObjectXref = aliased(ObjectXrefUORM) + + MasterXref = aliased(XrefUORM) + DependentXref = aliased(XrefUORM) + + query = select(DependentObjectXref.object_xref_id.distinct()).where( + DependentXref.xref_id == DependentXrefUORM.dependent_xref_id, + MasterXref.xref_id == DependentXrefUORM.master_xref_id, + DependentXref.xref_id == DependentObjectXref.xref_id, + MasterXref.xref_id == MasterObjectXref.xref_id, + MasterObjectXref.object_xref_id == master_object_xref_id, + DependentObjectXref.master_xref_id == MasterXref.xref_id, + DependentObjectXref.ensembl_id == translation_id, + DependentObjectXref.ensembl_object_type == "Translation", + DependentObjectXref.ox_status == "DUMP_OUT", + ) + for row in dbi.execute(query).mappings().all(): + self.update_object_xref_status(row.object_xref_id, "MULTI_DELETE", dbi) + + if not master_object_xref_ids.get(row.object_xref_id): + master_object_xref_ids[row.object_xref_id] = 1 + master_object_xrefs.append(row.object_xref_id) + + query = select(DependentObjectXref.object_xref_id.distinct()).where( + DependentXref.xref_id == DependentXrefUORM.dependent_xref_id, + MasterXref.xref_id == DependentXrefUORM.master_xref_id, + 
DependentXref.xref_id == DependentObjectXref.xref_id, + MasterXref.xref_id == MasterObjectXref.xref_id, + MasterObjectXref.object_xref_id == master_object_xref_id, + DependentObjectXref.master_xref_id == MasterXref.xref_id, + DependentObjectXref.ensembl_id == transcript_id, + DependentObjectXref.ensembl_object_type == "Transcript", + DependentObjectXref.ox_status == "DUMP_OUT", + ) + for row in dbi.execute(query).mappings().all(): + self.update_object_xref_status(row.object_xref_id, "MULTI_DELETE", dbi) + + if not master_object_xref_ids.get(row.object_xref_id): + master_object_xref_ids[row.object_xref_id] = 1 + master_object_xrefs.append(row.object_xref_id) diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py b/src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py new file mode 100644 index 000000000..ba212ddf6 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py @@ -0,0 +1,408 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for processing xref priorities.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class ProcessPriorities(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def process(self) -> None: + logging.info("Processing priorities") + + xref_dbi = self.xref().connect() + + names = self.get_priority_names(xref_dbi) + + for name in names: + logging.info(f"'{name}' will be processed as priority xrefs") + + # Set to failed all those that have no object xrefs + query = ( + select(XrefUORM.xref_id) + .outerjoin(ObjectXrefUORM, ObjectXrefUORM.xref_id == XrefUORM.xref_id) + .where( + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name == name, + ObjectXrefUORM.object_xref_id == None, + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + self.update_xref_dumped( + row.xref_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi + ) + + # Now ALL object_xrefs have an identity_xref + # So we can do a straight join and treat all info_types the same way + for name in names: + last_acc, last_name, best_xref_id, last_xref_id, seen = "", "", None, 0, 0 + best_ensembl_id, gone = [], [] + + query = ( + select( + ObjectXrefUORM.object_xref_id, + XrefUORM.accession, + XrefUORM.xref_id, + ( + IdentityXrefUORM.query_identity + + IdentityXrefUORM.target_identity + ).label("identity"), + ObjectXrefUORM.ox_status, + ObjectXrefUORM.ensembl_object_type, + ObjectXrefUORM.ensembl_id, + XrefUORM.info_type, + ) + .where( + ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name == name, + ) + .order_by( + XrefUORM.accession.desc(), + SourceUORM.priority, + desc("identity"), + XrefUORM.xref_id.desc(), + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + if last_acc 
== row.accession: + if row.xref_id != best_xref_id: + # We've already seen this accession before, and this xref_id is not the best one + seen = row.xref_id == last_xref_id + last_xref_id = row.xref_id + + # If xref is a sequence_match, we want to copy the alignment identity_xref to prioritised mappings of the same ensembl_id + if row.info_type == "SEQUENCE_MATCH": + identity_xref_row, object_xref_row = None, None + + query = select(IdentityXrefUORM).where( + IdentityXrefUORM.object_xref_id == row.object_xref_id + ) + result = xref_dbi.execute(query) + if result.rowcount > 0: + identity_xref_row = result.mappings().all()[0] + + query = select(ObjectXrefUORM.object_xref_id).where( + ObjectXrefUORM.xref_id == best_xref_id, + ObjectXrefUORM.ensembl_object_type + == row.ensembl_object_type, + ObjectXrefUORM.ensembl_id == row.ensembl_id, + ) + result = xref_dbi.execute(query) + if result.rowcount > 0: + object_xref_row = result.mappings().all()[0] + + if identity_xref_row and object_xref_row: + query = ( + update(IdentityXrefUORM) + .where( + IdentityXrefUORM.object_xref_id + == object_xref_row.object_xref_id + ) + .values( + query_identity=identity_xref_row.query_identity, + target_identity=identity_xref_row.target_identity, + hit_start=identity_xref_row.hit_start, + hit_end=identity_xref_row.hit_end, + translation_start=identity_xref_row.translation_start, + translation_end=identity_xref_row.translation_end, + cigar_line=identity_xref_row.cigar_line, + score=identity_xref_row.score, + evalue=identity_xref_row.evalue, + ) + ) + xref_dbi.execute(query) + + # If the xref is marked DUMP_OUT, set it to FAILED_PRIORITY + if row.ox_status == "DUMP_OUT": + xref_dbi.execute( + update(ObjectXrefUORM) + .where( + ObjectXrefUORM.object_xref_id == row.object_xref_id + ) + .values(ox_status="FAILED_PRIORITY") + ) + + # If it is the first time processing this xref_id, also process dependents and update status + if not seen: + self.update_xref_dumped( + row.xref_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi + ) + + # Copy synonyms across if they are missing + query = select(SynonymORM.synonym).where( + SynonymORM.xref_id == row.xref_id + ) + for synonym_row in ( + xref_dbi.execute(query).mappings().all() + ): + xref_dbi.execute( + insert(SynonymORM) + .values( + xref_id=best_xref_id, + synonym=synonym_row.synonym, + ) + .prefix_with("IGNORE") + ) + + self.process_dependents( + row.xref_id, best_xref_id, xref_dbi + ) + else: + # Status is not DUMP_OUT + self.update_xref_dumped( + row.xref_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi + ) + else: + # Alignment did not pass, dismiss + if row.ox_status == "FAILED_CUTOFF": + continue + + # There might be several mappings for the best priority + best_ensembl_id.append(row.ensembl_id) + + # Best priority failed so another one now found so set dumped + if len(gone) > 0: + if last_name == row.accession: + for x_id in gone: + self.update_xref_dumped( + x_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi + ) + else: + # New xref_id + if row.ox_status == "DUMP_OUT": + last_acc = row.accession + best_xref_id = row.xref_id + best_ensembl_id = [row.ensembl_id] + + if len(gone) > 0 and last_name == row.accession: + for x_id in gone: + self.update_xref_dumped( + x_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi + ) + gone = [] + else: + # New xref_id not DUMP_OUT + if last_name != row.accession: + gone = [] + + gone.append(row.xref_id) + last_name = row.accession + + xref_dbi.close() + + self.update_process_status("priorities_flagged") + + def get_priority_names(self, dbi: Connection) -> List[str]: + 
names = [] + seen = {} + last_name = "rubbish" + + query = ( + select( + SourceUORM.priority_description.label("description"), SourceUORM.name + ) + .where(SourceUORM.source_id == XrefUORM.source_id) + .group_by(SourceUORM.priority_description, SourceUORM.name) + .order_by(SourceUORM.name) + ) + for row in dbi.execute(query).mappings().all(): + if row.name == last_name and not seen.get(row.name): + names.append(row.name) + seen[row.name] = 1 + last_name = row.name + + return names + + def update_xref_dumped(self, xref_id: int, dumped: str, dbi: Connection) -> None: + dbi.execute( + update(XrefUORM).where(XrefUORM.xref_id == xref_id).values(dumped=dumped) + ) + + def process_dependents(self, old_master_xref_id: int, new_master_xref_id: int, dbi: Connection) -> None: + master_xrefs = [old_master_xref_id] + recursive = 0 + + # Create a hash of all possible mappings for this accession + ensembl_ids = {} + query = ( + select( + ObjectXrefUORM.ensembl_object_type.distinct(), ObjectXrefUORM.ensembl_id + ) + .where( + ObjectXrefUORM.ox_status != "FAILED_CUTOFF", + ObjectXrefUORM.xref_id == new_master_xref_id, + ) + .order_by(ObjectXrefUORM.ensembl_object_type) + ) + for row in dbi.execute(query).mappings().all(): + ensembl_ids.setdefault(row.ensembl_object_type, []).append(row.ensembl_id) + + old_ensembl_ids = {} + query = ( + select( + ObjectXrefUORM.ensembl_object_type.distinct(), ObjectXrefUORM.ensembl_id + ) + .where( + ObjectXrefUORM.ox_status != "FAILED_CUTOFF", + ObjectXrefUORM.xref_id == old_master_xref_id, + ) + .order_by(ObjectXrefUORM.ensembl_object_type) + ) + for row in dbi.execute(query).mappings().all(): + old_ensembl_ids.setdefault(row.ensembl_object_type, []).append( + row.ensembl_id + ) + + # Loop through all dependent xrefs of old master xref, and recurse + while master_xrefs: + xref_id = master_xrefs.pop() + + if recursive: + new_master_xref_id = xref_id + + # Get dependent xrefs, be they gene, transcript or translation + query = ( + select( + DependentXrefUORM.dependent_xref_id.distinct(), + DependentXrefUORM.linkage_annotation, + DependentXrefUORM.linkage_source_id, + ObjectXrefUORM.ensembl_object_type, + ) + .where( + ObjectXrefUORM.xref_id == DependentXrefUORM.dependent_xref_id, + ObjectXrefUORM.master_xref_id == DependentXrefUORM.master_xref_id, + DependentXrefUORM.master_xref_id == xref_id, + ) + .order_by(ObjectXrefUORM.ensembl_object_type) + ) + for row in dbi.execute(query).mappings().all(): + # Remove all mappings to low priority xrefs + # Then delete any leftover identity xrefs of it + for ensembl_id in old_ensembl_ids.get(row.ensembl_object_type): + self._detach_object_xref( + xref_id, + row.dependent_xref_id, + row.ensembl_object_type, + ensembl_id, + dbi, + ) + + # Duplicate each dependent for the new master xref if it is the first in the chain + if not recursive: + dbi.execute( + insert(DependentXrefUORM) + .values( + master_xref_id=new_master_xref_id, + dependent_xref_id=row.dependent_xref_id, + linkage_annotation=row.linkage_annotation, + linkage_source_id=row.linkage_source_id, + ) + .prefix_with("IGNORE") + ) + + # Loop through all chosen (best) ensembl ids mapped to priority xref, and connect them with object_xrefs + for ensembl_id in ensembl_ids.get(row.ensembl_object_type): + # Add new object_xref for each best_ensembl_id + dbi.execute( + insert(ObjectXrefUORM) + .values( + master_xref_id=new_master_xref_id, + ensembl_object_type=row.ensembl_object_type, + ensembl_id=ensembl_id, + linkage_type="DEPENDENT", + ox_status="DUMP_OUT", + 
xref_id=row.dependent_xref_id, + ) + .prefix_with("IGNORE") + ) + + # Get inserted ID + query = select(ObjectXrefUORM.object_xref_id).where( + ObjectXrefUORM.master_xref_id == new_master_xref_id, + ObjectXrefUORM.ensembl_object_type == row.ensembl_object_type, + ObjectXrefUORM.ensembl_id == ensembl_id, + ObjectXrefUORM.linkage_type == "DEPENDENT", + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.xref_id == row.dependent_xref_id, + ) + for object_xref_row in dbi.execute(query).mappings().all(): + dbi.execute( + insert(IdentityXrefUORM) + .values( + object_xref_id=object_xref_row.object_xref_id, + query_identity=100, + target_identity=100, + ) + .prefix_with("IGNORE") + ) + + if row.dependent_xref_id != xref_id: + master_xrefs.append(row.dependent_xref_id) + + recursive = 1 + + def _detach_object_xref(self, xref_id: int, dependent_xref_id: int, object_type: str, ensembl_id: int, dbi: Connection) -> None: + # Drop all the identity and go xrefs for the dependents of an xref + query = ( + select(ObjectXrefUORM.object_xref_id) + .outerjoin( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + .where( + ObjectXrefUORM.master_xref_id == xref_id, + ObjectXrefUORM.ensembl_object_type == object_type, + ObjectXrefUORM.xref_id == dependent_xref_id, + ObjectXrefUORM.ensembl_id == ensembl_id, + ) + ) + result = dbi.execute(query).fetchall() + object_xref_ids = [row[0] for row in result] + + dbi.execute( + delete(IdentityXrefUORM).where( + IdentityXrefUORM.object_xref_id.in_(object_xref_ids) + ) + ) + + # Change status of object_xref to FAILED_PRIORITY for record keeping + dbi.execute( + update(ObjectXrefUORM) + .where( + ObjectXrefUORM.master_xref_id == xref_id, + ObjectXrefUORM.ensembl_object_type == object_type, + ObjectXrefUORM.xref_id == dependent_xref_id, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_id == ensembl_id, + ) + .values(ox_status="FAILED_PRIORITY") + ) + + # Delete the duplicates + dbi.execute( + delete(ObjectXrefUORM).where( + ObjectXrefUORM.master_xref_id == xref_id, + ObjectXrefUORM.ensembl_object_type == object_type, + ObjectXrefUORM.xref_id == dependent_xref_id, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_id == ensembl_id, + ) + ) diff --git a/src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py b/src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py new file mode 100644 index 000000000..473af5a69 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py @@ -0,0 +1,28 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
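Before moving on to the checksum mappers, it helps to restate the selection rule that ProcessPriorities.process applies above: for each accession, candidate mappings are ordered by source priority and by combined query/target identity, the winner keeps its object_xrefs, and every other candidate is flagged (FAILED_PRIORITY on the object_xref, NO_DUMP_ANOTHER_PRIORITY on the xref) before its dependents are re-pointed at the winner. A minimal, self-contained sketch of that ranking follows; Candidate and pick_best_per_accession are illustrative names, not part of this patch.

from collections import namedtuple

Candidate = namedtuple("Candidate", "xref_id accession priority query_identity target_identity")

def pick_best_per_accession(candidates):
    # Keep, per accession, the candidate with the lowest source priority value;
    # break ties on the highest combined query + target identity.
    best = {}
    for cand in candidates:
        rank = (cand.priority, -(cand.query_identity + cand.target_identity))
        current = best.get(cand.accession)
        if current is None or rank < current[0]:
            best[cand.accession] = (rank, cand.xref_id)
    return {acc: xref_id for acc, (rank, xref_id) in best.items()}

# Priority 1 beats priority 2 for P12345; identity breaks the tie for Q99999.
picked = pick_best_per_accession([
    Candidate(10, "P12345", 2, 90, 90),
    Candidate(11, "P12345", 1, 80, 80),
    Candidate(30, "Q99999", 1, 70, 70),
    Candidate(31, "Q99999", 1, 95, 95),
])
assert picked == {"P12345": 11, "Q99999": 31}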
+ +"""Mapper module for processing RNACentral xref data.""" + +from ensembl.production.xrefs.mappers.ChecksumMapper import * + + +class RNACentralMapper(ChecksumMapper): + def target(self) -> str: + return self.mapper().dna_file() + + def external_db_name(self) -> str: + return "RNAcentral" + + def object_type(self) -> str: + return "Transcript" diff --git a/src/python/ensembl/production/xrefs/mappers/TestMappings.py b/src/python/ensembl/production/xrefs/mappers/TestMappings.py new file mode 100644 index 000000000..4511741d1 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/TestMappings.py @@ -0,0 +1,199 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for running validity checks on xref data.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class TestMappings(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def direct_stable_id_check(self) -> int: + xref_dbi = self.xref().connect() + + db_tables = { + "gene": {"direct": GeneDirectXrefORM, "stable_id": GeneStableIdORM}, + "transcript": { + "direct": TranscriptDirectXrefORM, + "stable_id": TranscriptStableIdORM, + }, + "translation": { + "direct": TranslationDirectXrefORM, + "stable_id": TranslationStableIdORM, + }, + } + + total_warnings_count = 0 + + for object_type in ["gene", "transcript", "translation"]: + warnings_count = 0 + direct_table = db_tables[object_type]["direct"] + stable_id_table = db_tables[object_type]["stable_id"] + + query = ( + select(SourceUORM.name, func.count(XrefUORM.xref_id).label("count")) + .join(XrefUORM, SourceUORM.source_id == XrefUORM.source_id) + .join(direct_table, XrefUORM.xref_id == direct_table.general_xref_id) + .outerjoin( + stable_id_table, + stable_id_table.stable_id == direct_table.ensembl_stable_id, + ) + .where(stable_id_table.stable_id == None) + .group_by(SourceUORM.name) + ) + for row in xref_dbi.execute(query).mappings().all(): + logging.warn( + f"{row.name} has {row.count} invalid stable IDs in {object_type}_direct_xref" + ) + warnings_count += 1 + + total_warnings_count += warnings_count + + xref_dbi.close() + + self.update_process_status("direct_stable_id_check_done") + + return total_warnings_count + + def xrefs_counts_check(self) -> int: + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + warnings_count = 0 + core_count, xref_count = {}, {} + + # TO DO: sqlalchemy syntax -- can't figure out how to count 2 columns + xref_query = f'SELECT s.name, COUNT(DISTINCT x.xref_id, ox.ensembl_id) AS count FROM xref x, object_xref ox, source s WHERE ox.xref_id = x.xref_id AND x.source_id = s.source_id AND ox_status = "DUMP_OUT" GROUP BY s.name' + for row in xref_dbi.execute(text(xref_query)).mappings().all(): + xref_count[row.name] = row.count + + query = ( + select( + ExternalDbORM.db_name, + 
func.count(ObjectXrefCORM.object_xref_id).label("count"), + ) + .where( + XrefCORM.xref_id == ObjectXrefCORM.xref_id, + XrefCORM.external_db_id == ExternalDbORM.external_db_id, + ) + .filter((XrefCORM.info_type == None) | (XrefCORM.info_type != "PROJECTION")) + .group_by(ExternalDbORM.db_name) + ) + for row in core_dbi.execute(query).mappings().all(): + change = 0 + core_count[row.db_name] = row.count + + if xref_count.get(row.db_name): + change = ((xref_count[row.db_name] - row.count) / row.count) * 100 + + if change > 5: + logging.warn( + f"{row.db_name} has increased by {change}%. It was {row.count} in the core DB, while it is {xref_count[row.db_name]} in the xref DB" + ) + warnings_count += 1 + elif change < -5: + logging.warn( + f"{row.db_name} has decreased by {change}%. It was {row.count} in the core DB, while it is {xref_count[row.db_name]} in the xref DB" + ) + warnings_count += 1 + else: + logging.warn( + f"{row.db_name} xrefs are not in the xref DB but {row.count} are in the core DB" + ) + warnings_count += 1 + + for name, count in xref_count.items(): + if not core_count.get(name): + logging.warn( + f"{name} has {count} xrefs in the xref DB but none in the core DB" + ) + warnings_count += 1 + + xref_dbi.close() + core_dbi.close() + + self.update_process_status("xrefs_counts_check_done") + + return warnings_count + + def name_change_check(self, official_name: str = None) -> int: + if not official_name: + return 0 + + new_name, id_to_stable_id, alias = {}, {}, {} + warnings_count, total_count = 0, 0 + + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + query = select( + XrefUORM.label, GeneStableIdORM.internal_id, GeneStableIdORM.stable_id + ).where( + XrefUORM.xref_id == ObjectXrefUORM.object_xref_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + GeneStableIdORM.internal_id == ObjectXrefUORM.ensembl_id, + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name.like(f"{official_name}_%"), + ) + for row in xref_dbi.execute(query).mappings().all(): + new_name[row.internal_id] = row.label + id_to_stable_id[row.internal_id] = row.stable_id + + query = ( + select(XrefUORM.label, SynonymORM.synonym) + .where( + XrefUORM.xref_id == SynonymORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ) + .filter( + (SourceUORM.name.like(f"{official_name}_%")) + | (SourceUORM.name.like("EntrezGene")) + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + alias[row.synonym] = row.label + + query = select(XrefCORM.display_label, GeneORM.gene_id).where( + XrefCORM.xref_id == GeneORM.display_xref_id, + GeneORM.biotype == "protein_coding", + ) + for row in core_dbi.execute(query).mappings().all(): + if new_name.get(row.gene_id): + total_count += 1 + + if new_name.get(row.gene_id) and new_name[row.gene_id] != row.display_label: + if ( + not alias.get(row.display_label) + or alias.get(row.display_label) != new_name[row.gene_id] + ): + logging.warn( + f"gene ID ({row.gene_id}) {id_to_stable_id[row.gene_id]} new = {new_name[row.gene_id]} old = {row.display_label}" + ) + warnings_count += 1 + + if total_count: + logging.warn( + f"{warnings_count} entries with different names out of {total_count} protein coding gene comparisons" + ) + + xref_dbi.close() + core_dbi.close() + + self.update_process_status("name_change_check_done") + + return warnings_count diff --git a/src/python/ensembl/production/xrefs/mappers/UniParcMapper.py b/src/python/ensembl/production/xrefs/mappers/UniParcMapper.py new file mode 100644 index 000000000..f518303bb --- /dev/null +++ 
b/src/python/ensembl/production/xrefs/mappers/UniParcMapper.py @@ -0,0 +1,28 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper module for processing UniParc xref data.""" + +from ensembl.production.xrefs.mappers.ChecksumMapper import * + + +class UniParcMapper(ChecksumMapper): + def target(self) -> str: + return self.mapper().protein_file() + + def external_db_name(self) -> str: + return "UniParc" + + def object_type(self) -> str: + return "Translation" diff --git a/src/python/ensembl/production/xrefs/mappers/XrefLoader.py b/src/python/ensembl/production/xrefs/mappers/XrefLoader.py new file mode 100644 index 000000000..be634d870 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/XrefLoader.py @@ -0,0 +1,804 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
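RNACentralMapper and UniParcMapper above are deliberately thin: a checksum-based source only has to say which FASTA file to checksum (target), which external_db the hits belong to, and which Ensembl object type they attach to; the MD5 hashing and accession lookup are handled by ChecksumBasic and MySQLChecksum later in this patch, with the shared ChecksumMapper base class tying them together. A new checksum source would follow the same shape, sketched here with made-up names (MyChecksumMapper and the "MyDB" external_db are placeholders, not sources known to this pipeline):

from ensembl.production.xrefs.mappers.ChecksumMapper import ChecksumMapper


class MyChecksumMapper(ChecksumMapper):
    # Placeholder subclass illustrating the three hooks a checksum source overrides.
    def target(self) -> str:
        # FASTA file whose sequences are checksummed (peptides here, as for UniParc).
        return self.mapper().protein_file()

    def external_db_name(self) -> str:
        # Must match a db_name row in the core external_db table.
        return "MyDB"

    def object_type(self) -> str:
        # Ensembl feature type the resulting object_xrefs point at.
        return "Translation"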
+ +"""Mapper module for loading xref data into the core DB.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class XrefLoader(BasicMapper): + def __init__(self, mapper: BasicMapper) -> None: + self.xref(mapper.xref()) + self.core(mapper.core()) + mapper.set_up_logging() + + def update(self, species_name: str) -> None: + logging.info("Loading xrefs into core DB") + + xref_dbi = self.xref().connect() + core_dbi = self.core().connect() + + # Delete xref data related to projections + self.delete_projection_data(core_dbi) + + # Get the source IDs of relevant external DBs + name_to_external_db_id, source_id_to_external_db_id = {}, {} + + query = select(ExternalDbORM.external_db_id, ExternalDbORM.db_name) + for row in core_dbi.execute(query).mappings().all(): + name_to_external_db_id[row.db_name] = row.external_db_id + + query = ( + select(SourceUORM.source_id, SourceUORM.name) + .where(SourceUORM.source_id == XrefUORM.source_id) + .group_by(SourceUORM.source_id) + ) + for row in xref_dbi.execute(query).mappings().all(): + if name_to_external_db_id.get(row.name): + source_id_to_external_db_id[row.source_id] = name_to_external_db_id[ + row.name + ] + elif re.search(r"notransfer$", row.name): + continue + else: + raise LookupError( + f"Could not find {row.name} in external_db table in the core DB" + ) + + # Reset dumped field in case module is running again + xref_dbi.execute( + update(XrefUORM) + .values(dumped=None) + .where(XrefUORM.dumped != "NO_DUMP_ANOTHER_PRIORITY") + ) + + # Delete existing xrefs in core DB (only from relevant sources) + self.deleted_existing_xrefs(name_to_external_db_id, xref_dbi, core_dbi) + + # Get the offsets for xref and object_xref tables + # This is used to track the xrefs whe mapping onto the core DB + xref_offset = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() + object_xref_offset = core_dbi.execute( + select(func.max(ObjectXrefCORM.object_xref_id)) + ).scalar() + + if not xref_offset: + xref_offset = 0 + else: + xref_offset = int(xref_offset) + self.add_meta_pair("xref_offset", xref_offset) + if not object_xref_offset: + object_xref_offset = 0 + else: + object_xref_offset = int(object_xref_offset) + self.add_meta_pair("object_xref_offset", object_xref_offset) + + logging.info( + f"DB offsets: xref={xref_offset}, object_xref={object_xref_offset}" + ) + + # Get analysis IDs + analysis_ids = self.get_analysis(core_dbi) + + # Prepare some queries + xref_object_query = ( + select(XrefUORM, ObjectXrefUORM) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ) + .order_by(XrefUORM.xref_id) + ) + xref_object_identity_query = ( + select(XrefUORM, ObjectXrefUORM, IdentityXrefUORM) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ) + .order_by(XrefUORM.xref_id) + ) + + #### TO DO: transaction + + # Get source info from xref DB + query = ( + select( + SourceUORM.source_id, + SourceUORM.name, + XrefUORM.info_type, + func.count(XrefUORM.xref_id).label("count"), + SourceUORM.priority_description, + SourceUORM.source_release, + ) + .where( + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ) + .group_by(SourceUORM.source_id, SourceUORM.name, XrefUORM.info_type) + ) + for source_row in xref_dbi.execute(query).mappings().all(): + # We only care about specific sources + if not 
name_to_external_db_id.get(source_row.name): + continue + logging.info( + f"Updating source '{source_row.name}' ({source_row.source_id}) in core" + ) + + where_from = source_row.priority_description + if where_from: + where_from = f"Generated via {where_from}" + + external_id = name_to_external_db_id[source_row.name] + xref_list = [] + + if ( + source_row.info_type == "DIRECT" + or source_row.info_type == "INFERRED_PAIR" + or source_row.info_type == "MISC" + ): + count, last_xref_id = 0, 0 + + # Get all direct, inferred pair and misc xrefs from intermediate DB + query = xref_object_identity_query.where( + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == source_row.info_type, + ) + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + info_text = xref_row.info_text + if not info_text: + info_text = where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": xref_row.label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, core_dbi) + last_xref_id = xref_id + + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + "analysis_id": analysis_ids[xref_row.ensembl_object_type], + } + object_xref_id = self.add_object_xref( + object_xref_offset, object_xref_args, core_dbi + ) + + # Add identity xref into core DB + if xref_row.translation_start: + query = ( + insert(IdentityXrefCORM) + .values( + object_xref_id=object_xref_id + object_xref_offset, + xref_identity=xref_row.query_identity, + ensembl_identity=xref_row.target_identity, + xref_start=xref_row.hit_start, + xref_end=xref_row.hit_end, + ensembl_start=xref_row.translation_start, + ensembl_end=xref_row.translation_end, + cigar_line=xref_row.cigar_line, + score=xref_row.score, + evalue=xref_row.evalue, + ) + .prefix_with("IGNORE") + ) + core_dbi.execute(query) + + logging.info( + f"\tLoaded {count} {source_row.info_type} xrefs for '{species_name}'" + ) + elif source_row.info_type == "CHECKSUM": + count, last_xref_id = 0, 0 + + # Get all checksum xrefs from intermediate DB + query = xref_object_query.where( + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == source_row.info_type, + ) + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + info_text = xref_row.info_text + if not info_text: + info_text = where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": xref_row.label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, core_dbi) + last_xref_id = xref_id + + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + 
"analysis_id": analysis_ids["checksum"], + } + object_xref_id = self.add_object_xref( + object_xref_offset, object_xref_args, core_dbi + ) + + logging.info(f"\tLoaded {count} CHECKSUM xrefs for '{species_name}'") + elif source_row.info_type == "DEPENDENT": + count, last_xref_id, last_ensembl_id, master_error_count = 0, 0, 0, 0 + master_problems = [] + + # Get all dependent xrefs from intermediate DB + MasterXref = aliased(XrefUORM) + query = ( + select(XrefUORM, ObjectXrefUORM) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.master_xref_id == MasterXref.xref_id, + MasterXref.source_id == SourceUORM.source_id, + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == "DEPENDENT", + ) + .order_by( + XrefUORM.xref_id, ObjectXrefUORM.ensembl_id, SourceUORM.ordered + ) + ) + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + label = xref_row.label + if not label: + label = xref_row.accession + info_text = xref_row.info_text + if not info_text: + info_text = where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, core_dbi) + + if ( + last_xref_id != xref_id + or last_ensembl_id != xref_row.ensembl_id + ): + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + "analysis_id": analysis_ids[xref_row.ensembl_object_type], + } + object_xref_id = self.add_object_xref( + object_xref_offset, object_xref_args, core_dbi + ) + + if xref_row.master_xref_id: + # Add dependent xref into core DB + core_dbi.execute( + insert(DependentXrefCORM) + .values( + object_xref_id=object_xref_id + object_xref_offset, + master_xref_id=xref_row.master_xref_id + + xref_offset, + dependent_xref_id=xref_id + xref_offset, + ) + .prefix_with("IGNORE") + ) + else: + if master_error_count < 10: + master_problems.append(xref_row.accession) + + master_error_count += 1 + + last_xref_id = xref_id + last_ensembl_id = xref_row.ensembl_id + + if len(master_problems) > 0: + logging.warn( + f"For {source_row.name}, there were {master_error_count} problem master xrefs. 
Examples are: " + + ", ".join(master_problems) + ) + + logging.info(f"\tLoaded {count} DEPENDENT xrefs for '{species_name}'") + elif source_row.info_type == "SEQUENCE_MATCH": + count, last_xref_id = 0, 0 + + # Get all direct, inferred pair and misc xrefs from intermediate DB + query = xref_object_identity_query.where( + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == source_row.info_type, + ) + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + info_text = xref_row.info_text + if not info_text: + info_text = where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": xref_row.label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, core_dbi) + last_xref_id = xref_id + + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + "analysis_id": analysis_ids[xref_row.ensembl_object_type], + } + object_xref_id = self.add_object_xref( + object_xref_offset, object_xref_args, core_dbi + ) + + # Add identity xref into core DB + query = ( + insert(IdentityXrefCORM) + .values( + object_xref_id=object_xref_id + object_xref_offset, + xref_identity=xref_row.query_identity, + ensembl_identity=xref_row.target_identity, + xref_start=xref_row.hit_start, + xref_end=xref_row.hit_end, + ensembl_start=xref_row.translation_start, + ensembl_end=xref_row.translation_end, + cigar_line=xref_row.cigar_line, + score=xref_row.score, + evalue=xref_row.evalue, + ) + .prefix_with("IGNORE") + ) + core_dbi.execute(query) + + logging.info( + f"\tLoaded {count} SEQUENCE_MATCH xrefs for '{species_name}'" + ) + else: + logging.debug(f"\tPROBLEM: what type is {source_row.info_type}") + + # Transfer synonym data + if len(xref_list) > 0: + syn_count = 0 + + # Get synonyms + query = select(SynonymORM.xref_id, SynonymORM.synonym).where( + SynonymORM.xref_id.in_(xref_list) + ) + for syn_row in xref_dbi.execute(query).mappings().all(): + core_dbi.execute( + insert(ExternalSynonymORM).values( + xref_id=syn_row.xref_id + xref_offset, + synonym=syn_row.synonym, + ) + ) + + syn_count += 1 + + logging.info(f"\tLoaded {syn_count} synonyms for '{species_name}'") + + # Set dumped status + xref_dbi.execute( + update(XrefUORM) + .values(dumped="MAPPED") + .where(XrefUORM.xref_id.in_(xref_list)) + ) + + # Update release info + if source_row.source_release and source_row.source_release != "1": + core_dbi.execute( + update(ExternalDbORM) + .values(db_release=source_row.source_release) + .where(ExternalDbORM.external_db_id == external_id) + ) + + # Update the unmapped xrefs + self.update_unmapped_xrefs(xref_dbi) + + self.update_process_status("core_loaded") + + xref_dbi.close() + core_dbi.close() + + def delete_projection_data(self, dbi: Connection) -> None: + # Delete all the projections from the core DB + + dbi.execute(delete(OntologyXrefORM)) + logging.info("Deleted all ontology_xref rows") + + row_count = dbi.execute( + update(GeneORM) + .values(display_xref_id=None, description=None) + .where( + XrefCORM.xref_id == GeneORM.display_xref_id, + XrefCORM.info_type == "PROJECTION", + ) + 
).rowcount + logging.info( + f"Set display_xref_id and description to NULL in {row_count} gene row(s) related to PROJECTION xrefs" + ) + + counts = {} + counts["external_synonym"] = dbi.execute( + delete(ExternalSynonymORM).where( + XrefCORM.xref_id == ExternalSynonymORM.xref_id, + XrefCORM.info_type == "PROJECTION", + ) + ).rowcount + counts["dependent_xref"] = dbi.execute( + delete(DependentXrefCORM).where( + XrefCORM.xref_id == DependentXrefCORM.dependent_xref_id, + XrefCORM.info_type == "PROJECTION", + ) + ).rowcount + counts["object_xref"] = dbi.execute( + delete(ObjectXrefCORM).where( + XrefCORM.xref_id == ObjectXrefCORM.xref_id, + XrefCORM.info_type == "PROJECTION", + ) + ).rowcount + counts["xref"] = dbi.execute( + delete(XrefCORM).where(XrefCORM.info_type == "PROJECTION") + ).rowcount + + logging.info( + f"Deleted all PROJECTIONs rows: {counts['external_synonym']} external_synonyms, {counts['dependent_xref']} dependent_xrefs, {counts['object_xref']} object_xrefs, {counts['xref']} xrefs" + ) + + def deleted_existing_xrefs(self, name_to_external_db_id: Dict[str, int], xref_dbi: Connection, core_dbi: Connection) -> None: + # For each external_db to be updated, delete the existing xrefs + query = ( + select(SourceUORM.name, func.count(XrefUORM.xref_id).label("count")) + .where( + XrefUORM.xref_id == ObjectXrefUORM.xref_id, + XrefUORM.source_id == SourceUORM.source_id, + ) + .group_by(SourceUORM.name) + ) + for row in xref_dbi.execute(query).mappings().all(): + if not name_to_external_db_id.get(row.name): + continue + + name = row.name + external_db_id = name_to_external_db_id[name] + counts = {"master_dependent_xref": 0, "master_object_xref": 0} + + logging.info(f"For source '{name}'") + + counts["gene"] = core_dbi.execute( + update(GeneORM) + .values(display_xref_id=None, description=None) + .where( + GeneORM.display_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + logging.info( + f"\tSet display_xref_id=NULL and description=NULL for {counts['gene']} gene row(s)" + ) + + counts["external_synonym"] = core_dbi.execute( + delete(ExternalSynonymORM).where( + ExternalSynonymORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["identity_xref"] = core_dbi.execute( + delete(IdentityXrefCORM).where( + IdentityXrefCORM.object_xref_id == ObjectXrefCORM.object_xref_id, + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["object_xref"] = core_dbi.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + + MasterXref = aliased(XrefCORM) + DependentXref = aliased(XrefCORM) + + query = select( + ObjectXrefCORM.object_xref_id, + DependentXrefCORM.master_xref_id, + DependentXrefCORM.dependent_xref_id, + ).where( + ObjectXrefCORM.object_xref_id == DependentXrefCORM.object_xref_id, + MasterXref.xref_id == DependentXrefCORM.master_xref_id, + DependentXref.xref_id == DependentXrefCORM.dependent_xref_id, + MasterXref.external_db_id == external_db_id, + ) + for row in core_dbi.execute(query).mappings().all(): + counts["master_dependent_xref"] += core_dbi.execute( + delete(DependentXrefCORM).where( + DependentXrefCORM.master_xref_id == row.master_xref_id, + DependentXrefCORM.dependent_xref_id == row.dependent_xref_id, + ) + ).rowcount + counts["master_object_xref"] += core_dbi.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.object_xref_id == 
row.object_xref_id + ) + ).rowcount + + counts["dependent_xref"] = core_dbi.execute( + delete(DependentXrefCORM).where( + DependentXrefCORM.dependent_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["xref"] = core_dbi.execute( + delete(XrefCORM).where(XrefCORM.external_db_id == external_db_id) + ).rowcount + counts["unmapped_object"] = core_dbi.execute( + delete(UnmappedObjectORM).where( + UnmappedObjectORM.unmapped_object_type == "xref", + UnmappedObjectORM.external_db_id == external_db_id, + ) + ).rowcount + + logging.info( + f"\tDeleted rows: {counts['external_synonym']} external_synonyms, {counts['identity_xref']} identity_xrefs, {counts['object_xref']} object_xrefs, {counts['master_dependent_xref']} master dependent_xrefs, {counts['master_object_xref']} master object_xrefs, {counts['dependent_xref']} dependent_xrefs, {counts['xref']} xrefs, {counts['unmapped_object']} unmapped_objects" + ) + + def get_analysis(self, dbi: Connection) -> Dict[str, int]: + analysis_ids = {} + type_to_logic_name = { + "Gene": "xrefexoneratedna", + "Transcript": "xrefexoneratedna", + "Translation": "xrefexonerateprotein", + } + + for object_type in ["Gene", "Transcript", "Translation"]: + logic_name = type_to_logic_name[object_type] + analysis_ids[object_type] = self.get_single_analysis(logic_name, dbi) + + analysis_ids["checksum"] = self.get_single_analysis("xrefchecksum", dbi) + + return analysis_ids + + def get_single_analysis(self, logic_name: str, dbi: Connection) -> int: + analysis_id = dbi.execute( + select(AnalysisORM.analysis_id).where(AnalysisORM.logic_name == logic_name) + ).scalar() + + if not analysis_id: + Session = sessionmaker(self.core()) + with Session.begin() as session: + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + analysis_object = AnalysisORM(logic_name=logic_name, created=now) + session.add(analysis_object) + session.flush() + analysis_id = analysis_object.analysis_id + + return analysis_id + + def add_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> int: + xref_id = args["xref_id"] + accession = args["accession"] + external_db_id = args["external_db_id"] + label = args["label"] + description = args["description"] + version = args["version"] + info_type = args["info_type"] + info_text = args["info_text"] + + new_xref_id = dbi.execute( + select(XrefCORM.xref_id).where( + XrefCORM.dbprimary_acc == accession, + XrefCORM.external_db_id == external_db_id, + XrefCORM.info_type == info_type, + XrefCORM.info_text == info_text, + XrefCORM.version == version, + ) + ).scalar() + + if not new_xref_id: + dbi.execute( + insert(XrefCORM).values( + xref_id=xref_id + offset, + external_db_id=external_db_id, + dbprimary_acc=accession, + display_label=label, + version=version, + description=description, + info_type=info_type, + info_text=info_text, + ) + ) + + return xref_id + else: + return int(new_xref_id) - offset + + def add_object_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> int: + object_xref_id = args["object_xref_id"] + ensembl_id = args["ensembl_id"] + ensembl_type = args["ensembl_type"] + xref_id = args["xref_id"] + analysis_id = args["analysis_id"] + + new_object_xref_id = dbi.execute( + select(ObjectXrefCORM.object_xref_id).where( + ObjectXrefCORM.xref_id == xref_id, + ObjectXrefCORM.ensembl_object_type == ensembl_type, + ObjectXrefCORM.ensembl_id == ensembl_id, + ObjectXrefCORM.analysis_id == analysis_id, + ) + ).scalar() + + if not new_object_xref_id: + dbi.execute( + 
insert(ObjectXrefCORM).values( + object_xref_id=object_xref_id + offset, + ensembl_id=ensembl_id, + ensembl_object_type=ensembl_type, + xref_id=xref_id, + analysis_id=analysis_id, + ) + ) + + return object_xref_id + else: + return int(new_object_xref_id) - offset + + def update_unmapped_xrefs(self, dbi: Connection) -> None: + logging.info("Updating unmapped xrefs in xref DB") + + # Direct xrefs + query = ( + select(XrefUORM.xref_id) + .outerjoin(ObjectXrefUORM, XrefUORM.xref_id == ObjectXrefUORM.xref_id) + .where( + XrefUORM.source_id == SourceUORM.source_id, + XrefUORM.dumped == None, + ObjectXrefUORM.ox_status != "FAILED_PRIORITY", + XrefUORM.info_type == "DIRECT", + ) + ) + result = dbi.execute(query).fetchall() + xref_ids = [row[0] for row in result] + dbi.execute( + update(XrefUORM) + .values(dumped="UNMAPPED_NO_STABLE_ID") + .where(XrefUORM.xref_id.in_(xref_ids)) + ) + + # Misc xrefs + dbi.execute( + update(XrefUORM) + .values(dumped="UNMAPPED_NO_MAPPING") + .where( + XrefUORM.source_id == SourceUORM.source_id, + XrefUORM.dumped == None, + XrefUORM.info_type == "MISC", + ) + ) + + # Dependent xrefs + MasterXref = aliased(XrefUORM) + DependentXref = aliased(XrefUORM) + query = ( + select(DependentXref.xref_id) + .outerjoin( + DependentXrefUORM, + DependentXrefUORM.dependent_xref_id == DependentXref.xref_id, + ) + .outerjoin(ObjectXrefUORM, ObjectXrefUORM.xref_id == DependentXref.xref_id) + .where( + DependentXref.source_id == SourceUORM.source_id, + DependentXrefUORM.master_xref_id == MasterXref.xref_id, + DependentXref.dumped == None, + ObjectXrefUORM.ox_status != "FAILED_PRIORITY", + DependentXref.info_type == "DEPENDENT", + ) + ) + result = dbi.execute(query).fetchall() + xref_ids = [row[0] for row in result] + dbi.execute( + update(XrefUORM) + .values(dumped="UNMAPPED_MASTER_FAILED") + .where(XrefUORM.xref_id.in_(xref_ids)) + ) + + # Sequence match + query = ( + select(XrefUORM.xref_id) + .outerjoin(ObjectXrefUORM, XrefUORM.xref_id == ObjectXrefUORM.xref_id) + .outerjoin( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + .where( + XrefUORM.source_id == SourceUORM.source_id, + XrefUORM.xref_id == PrimaryXrefORM.xref_id, + XrefUORM.dumped == None, + XrefUORM.info_type == "SEQUENCE_MATCH", + ) + ) + result = dbi.execute(query).fetchall() + xref_ids = [row[0] for row in result] + dbi.execute( + update(XrefUORM) + .values(dumped="UNMAPPED_NO_MAPPING") + .where(XrefUORM.xref_id.in_(xref_ids)) + ) + + # Dependents with non existent masters (none on time of loading) + dbi.execute( + update(XrefUORM) + .values(dumped="UNMAPPED_NO_MASTER") + .where( + XrefUORM.source_id == SourceUORM.source_id, + XrefUORM.dumped == None, + XrefUORM.info_type == "DEPENDENT", + ) + ) diff --git a/src/python/ensembl/production/xrefs/mappers/__init__.py b/src/python/ensembl/production/xrefs/mappers/__init__.py new file mode 100644 index 000000000..a3a8b1334 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/__init__.py @@ -0,0 +1,15 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Xref mappers modules.""" diff --git a/src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py b/src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py new file mode 100644 index 000000000..b97b858c7 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py @@ -0,0 +1,91 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base method module for handling checksums.""" + +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq +import hashlib + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper +from sqlalchemy.engine import Connection +from typing import List, Dict, Any + +DEFAULT_BATCH_SIZE = 1000 +DEFAULT_LOG_SIZE = 10000 + + +class ChecksumBasic: + def __init__(self, args: Dict[str, Any] = None) -> None: + if args is None: + args = {} + + self._mapper = args.get("MAPPER") + if args.get("BATCH_SIZE"): + self._batch_size = args["BATCH_SIZE"] + else: + self._batch_size = DEFAULT_BATCH_SIZE + + def mapper(self, mapper: BasicMapper = None) -> BasicMapper: + if mapper: + self._mapper = mapper + + return self._mapper + + def batch_size(self, batch_size: int = None) -> int: + if batch_size: + self._batch_size = batch_size + + return self._batch_size + + def run(self, target: str, source_id: int, object_type: str, dbi: Connection) -> List[Dict[str, Any]]: + results, tmp_list = [], [] + count, total_count = 0, 0 + batch_size = self.batch_size() + + for record in SeqIO.parse(target, "fasta"): + tmp_list.append(record) + count += 1 + + if (count % batch_size) == 0: + res = self.perform_mapping(tmp_list, source_id, object_type, dbi) + for row in res: + results.append(row) + + total_count += count + if total_count % DEFAULT_LOG_SIZE: + self.mapper().log_progress( + f"Finished batch mapping of {total_count} sequences" + ) + count = 0 + tmp_list.clear() + + # Final mapping if there were some left over + if len(tmp_list) > 0: + self.mapper().log_progress( + f"Finished batch mapping of {total_count} sequences" + ) + res = self.perform_mapping(tmp_list, source_id, object_type, dbi) + for row in res: + results.append(row) + tmp_list.clear() + + return results + + def md5_checksum(self, sequence: Seq) -> str: + digest = hashlib.md5() + digest.update(sequence.encode()) + + return digest.hexdigest() diff --git a/src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py b/src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py new file mode 100644 index 
000000000..993753cd6 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py @@ -0,0 +1,48 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base method module for handling mysql checksums.""" + +from ensembl.production.xrefs.mappers.methods.ChecksumBasic import * + +from sqlalchemy import select +from ensembl.xrefs.xref_source_db_model import ChecksumXref as ChecksumXrefSORM + + +class MySQLChecksum(ChecksumBasic): + def perform_mapping(self, sequences: List[SeqRecord], source_id: int, object_type: str, dbi: Connection) -> List[Dict[str, Any]]: + final_results = [] + + for sequence in sequences: + checksum = self.md5_checksum(str(sequence.seq)).upper() + upi = None + + query = select(ChecksumXrefSORM.accession).where( + ChecksumXrefSORM.checksum == checksum, + ChecksumXrefSORM.source_id == source_id, + ) + for row in dbi.execute(query).mappings().all(): + local_upi = row.accession + if upi: + raise LookupError( + f"The sequence {sequence.id} had a checksum of {checksum} but this resulted in more than one UPI: [{upi}, {local_upi}]" + ) + upi = local_upi + + if upi: + final_results.append( + {"id": sequence.id, "upi": upi, "object_type": object_type} + ) + + return final_results diff --git a/src/python/ensembl/production/xrefs/mappers/methods/__init__.py b/src/python/ensembl/production/xrefs/mappers/methods/__init__.py new file mode 100644 index 000000000..33a2087e1 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/methods/__init__.py @@ -0,0 +1,15 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Xref mapper methods modules.""" diff --git a/src/python/ensembl/production/xrefs/mappers/species/__init__.py b/src/python/ensembl/production/xrefs/mappers/species/__init__.py new file mode 100644 index 000000000..9685c28ca --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/__init__.py @@ -0,0 +1,15 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Xref species-specific mapper modules.""" diff --git a/src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py b/src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py new file mode 100644 index 000000000..3a2b20dbd --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py @@ -0,0 +1,39 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species aedes_aegypti.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class aedes_aegypti(BasicMapper): + def gene_description_sources(self) -> List[str]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_External_Description", + ] + + return sources_list + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_External_Description", + ] + + ignore_queries = {} + + return sources_list, ignore_queries diff --git a/src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py b/src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py new file mode 100644 index 000000000..46e30cf99 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py @@ -0,0 +1,42 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Mapper extension module for species anopheles_gambiae.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class anopheles_gambiae(BasicMapper): + def gene_description_sources(self) -> List[str]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_RNA_Description", + ] + + return sources_list + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_RNA_Description", + ] + + ignore_queries = {} + + return sources_list, ignore_queries + + def gene_description_filter_regexps(self) -> List[str]: + return [] diff --git a/src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py b/src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py new file mode 100644 index 000000000..36a5f6696 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py @@ -0,0 +1,49 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species culex_quinquefasciatus.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class culex_quinquefasciatus(BasicMapper): + def gene_description_sources(self) -> List[str]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_RNA_Description", + "VB_External_Description", + ] + + return sources_list + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_RNA_Description", + "VB_External_Description", + ] + + ignore_queries = {} + + return sources_list, ignore_queries + + def gene_description_filter_regexps(self) -> List[str]: + return [] + + def no_source_label_list(self) -> List[str]: + sources_list = ["VB_RNA_Description", "VB_External_Description"] + + return sources_list diff --git a/src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py b/src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py new file mode 100644 index 000000000..3a2b155ec --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py @@ -0,0 +1,30 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Mapper extension module for species danio_rerio.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * +from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs + + +class danio_rerio(BasicMapper): + def set_display_xrefs(self) -> None: + display = DisplayXrefs(self) + display.set_display_xrefs_from_stable_table() + + def official_name(self) -> str: + return "ZFIN_ID" + + def set_transcript_names(self) -> None: + return None diff --git a/src/python/ensembl/production/xrefs/mappers/species/drosophila.py b/src/python/ensembl/production/xrefs/mappers/species/drosophila.py new file mode 100644 index 000000000..2e327a735 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/drosophila.py @@ -0,0 +1,44 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species drosophila.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class drosophila(BasicMapper): + def gene_description_filter_regexps(self) -> List[str]: + return [] + + def gene_description_sources(self) -> List[str]: + sources_list = ["FlyBaseName_gene", "FlyBaseCGID_gene"] + + return sources_list + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["FlyBaseName_transcript", "FlyBaseCGID_transcript"] + + ignore_queries = {} + + return sources_list, ignore_queries + + def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["FlyBaseName_gene", "FlyBaseCGID_gene", "flybase_gene_id"] + + ignore_queries = {} + + return sources_list, ignore_queries + + def set_transcript_names(self) -> None: + return None diff --git a/src/python/ensembl/production/xrefs/mappers/species/eukaryota.py b/src/python/ensembl/production/xrefs/mappers/species/eukaryota.py new file mode 100644 index 000000000..1791da9c5 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/eukaryota.py @@ -0,0 +1,277 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Mapper extension module for species eukaryota.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class eukaryota(BasicMapper): + def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = [ + "TAIR_SYMBOL", + "RFAM", + "RNAMMER", + "TRNASCAN_SE", + "Uniprot_gn", + "ENA_GENE", + "BROAD_U_maydis", + "BROAD_F_oxysporum", + "BROAD_G_zeae", + "BROAD_G_moniliformis", + "BROAD_P_infestans", + "phyra_jgi_v1.1", + "physo1_jgi_v1.1", + "phatr_jgi_v2", + "phatr_jgi_v2_bd", + "PGD_GENE", + "Mycgr3_jgi_v2.0_gene", + "BROAD_Magnaporthe_DB", + "PHYTOZOME_GMAX_GENE", + ] + + ignore_queries = {} + + # Ignore EntrezGene labels dependent on predicted RefSeqs + MasterXref = aliased(XrefUORM) + DependentXref = aliased(XrefUORM) + MasterSource = aliased(SourceUORM) + DependentSource = aliased(SourceUORM) + + query = select(ObjectXrefUORM.object_xref_id.distinct()).where( + ObjectXrefUORM.xref_id == DependentXrefUORM.dependent_xref_id, + ObjectXrefUORM.master_xref_id == DependentXrefUORM.master_xref_id, + DependentXrefUORM.dependent_xref_id == DependentXref.xref_id, + DependentXrefUORM.master_xref_id == MasterXref.xref_id, + MasterXref.source_id == MasterSource.source_id, + DependentXref.source_id == DependentSource.source_id, + MasterSource.name.like("Refseq%predicted"), + DependentSource.name.like("EntrezGene"), + ObjectXrefUORM.ox_status == "DUMP_OUT", + ) + ignore_queries["EntrezGene"] = query + + query = ( + select(ObjectXrefUORM.object_xref_id) + .join(XrefUORM, XrefUORM.xref_id == ObjectXrefUORM.xref_id) + .join(SourceUORM, SourceUORM.source_id == XrefUORM.source_id) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + XrefUORM.label.regexp_match("^LOC[[:digit:]]+"), + ) + ) + ignore_queries["LOC_prefix"] = query + + return sources_list, ignore_queries + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = [ + "RFAM", + "RNAMMER", + "TRNASCAN_SE", + "Uniprot_gn_trans_name", + "ENA_GENE", + "BROAD_U_maydis", + "BROAD_F_oxysporum", + "BROAD_G_zeae", + "BROAD_G_moniliformis", + "BROAD_P_infestans", + "phyra_jgi_v1.1", + "physo1_jgi_v1.1", + "phatr_jgi_v2", + "phatr_jgi_v2_bd", + "PGD_GENE", + "Mycgr3_jgi_v2.0_gene", + "BROAD_Magnaporthe_DB", + "PHYTOZOME_GMAX_GENE", + ] + + ignore_queries = {} + + # Ignore EntrezGene labels dependent on predicted RefSeqs + MasterXref = aliased(XrefUORM) + DependentXref = aliased(XrefUORM) + MasterSource = aliased(SourceUORM) + DependentSource = aliased(SourceUORM) + + query = select(ObjectXrefUORM.object_xref_id.distinct()).where( + ObjectXrefUORM.xref_id == DependentXrefUORM.dependent_xref_id, + ObjectXrefUORM.master_xref_id == DependentXrefUORM.master_xref_id, + DependentXrefUORM.dependent_xref_id == DependentXref.xref_id, + DependentXrefUORM.master_xref_id == MasterXref.xref_id, + MasterXref.source_id == MasterSource.source_id, + DependentXref.source_id == DependentSource.source_id, + MasterSource.name.like("Refseq%predicted"), + DependentSource.name.like("EntrezGene"), + ObjectXrefUORM.ox_status == "DUMP_OUT", + ) + ignore_queries["EntrezGene"] = query + + query = ( + select(ObjectXrefUORM.object_xref_id) + .join(XrefUORM, XrefUORM.xref_id == ObjectXrefUORM.xref_id) + .join(SourceUORM, SourceUORM.source_id == XrefUORM.source_id) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + XrefUORM.label.regexp_match("^LOC[[:digit:]]+"), + ) + ) + ignore_queries["LOC_prefix"] = query + + return sources_list, ignore_queries + + def gene_description_sources(self) -> 
List[str]: + sources_list = [ + "TAIR_LOCUS", + "PomBase_GENE", + "PomBase_TRANSCRIPT", + "Uniprot/SWISSPROT", + "Uniprot/SPTREMBL", + "BROAD_U_maydis", + "BROAD_F_oxysporum", + "BROAD_G_zeae", + "BROAD_G_moniliformis", + "BROAD_P_infestans", + "phyra_jgi_v1.1", + "physo1_jgi_v1.1", + "phatr_jgi_v2", + "phatr_jgi_v2_bd", + "PGD_GENE", + "BROAD_Magnaporthe_DB", + "PGSC_GENE", + "PHYTOZOME_GMAX_GENE", + "RFAM", + "TRNASCAN_SE", + "RNAMMER", + ] + + return sources_list + + def set_transcript_names(self) -> None: + logging.info("Assigning transcript names from gene names") + + core_dbi = self.core().connect() + + # Reset transcript display xrefs + core_dbi.execute(update(TranscriptORM).values(display_xref_id=None)) + + # Get the max xref and object_xref IDs + xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() + xref_id = int(xref_id) + object_xref_id = core_dbi.execute( + select(func.max(ObjectXrefCORM.object_xref_id)) + ).scalar() + object_xref_id = int(object_xref_id) + + # Get all genes with set display_xref_id + query = select( + GeneORM.gene_id, + ExternalDbORM.db_name, + XrefCORM.dbprimary_acc, + XrefCORM.display_label, + XrefCORM.description, + ).where( + GeneORM.display_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == ExternalDbORM.external_db_id, + ) + for row in core_dbi.execute(query).mappings().all(): + # Get the ID of transcript name external DB + external_db_id = core_dbi.execute( + select(ExternalDbORM.external_db_id).where( + ExternalDbORM.db_name.like(f"{row.db_name}_trans_name") + ) + ).scalar() + + if not external_db_id: + raise LookupError( + f"No external_db_id found for '{row.db_name}_trans_name'" + ) + + # Get transcripts related to current gene + query = ( + select(TranscriptORM.transcript_id) + .where(TranscriptORM.gene_id == row.gene_id) + .order_by(TranscriptORM.seq_region_start, TranscriptORM.seq_region_end) + ) + for transcript_row in core_dbi.execute(query).mappings().all(): + object_xref_id += 1 + + # Check if xref already exists + insert_xref_id = core_dbi.execute( + select(XrefCORM.xref_id).where( + XrefCORM.external_db_id == external_db_id, + XrefCORM.display_label == row.display_label, + XrefCORM.version == 0, + XrefCORM.description == row.description, + XrefCORM.info_type == "MISC", + XrefCORM.info_text == "via gene name", + ) + ).scalar() + + if not insert_xref_id: + xref_id += 1 + + # Insert new xref + core_dbi.execute( + insert(XrefCORM) + .values( + xref_id=xref_id, + external_db_id=external_db_id, + dbprimary_acc=row.display_label, + display_label=row.display_label, + version=0, + description=row.description, + info_type="MISC", + info_text="via gene name", + ) + .prefix_with("IGNORE") + ) + + insert_xref_id = xref_id + + # Insert object xref + core_dbi.execute( + insert(ObjectXrefCORM).values( + object_xref_id=object_xref_id, + ensembl_id=transcript_row.transcript_id, + ensembl_object_type="Transcript", + xref_id=insert_xref_id, + ) + ) + + # Set transcript display xref + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=insert_xref_id) + .where(TranscriptORM.transcript_id == transcript_row.transcript_id) + ) + + # Delete object xrefs with no matching xref + query = ( + select(ObjectXrefCORM.object_xref_id) + .outerjoin(XrefCORM, XrefCORM.xref_id == ObjectXrefCORM.xref_id) + .where(XrefCORM.xref_id == None) + ) + result = core_dbi.execute(query).fetchall() + object_xref_ids = [row[0] for row in result] + + core_dbi.execute( + delete(ObjectXrefCORM).where( 
ObjectXrefCORM.object_xref_id.in_(object_xref_ids) + ) + ) + + core_dbi.close() diff --git a/src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py b/src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py new file mode 100644 index 000000000..616bd7326 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py @@ -0,0 +1,29 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species homo_sapiens.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class homo_sapiens(BasicMapper): + def official_name(self) -> str: + return "HGNC" + + def set_transcript_names(self) -> None: + return None + + def set_display_xrefs(self) -> None: + display = DisplayXrefs(self) + display.set_display_xrefs_from_stable_table() diff --git a/src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py b/src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py new file mode 100644 index 000000000..5861e03a7 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py @@ -0,0 +1,42 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species ixodes_scapularis.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class ixodes_scapularis(BasicMapper): + def gene_description_sources(self) -> List[str]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_External_Description", + ] + + return sources_list + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = [ + "VB_Community_Annotation", + "Uniprot/SWISSPROT", + "VB_External_Description", + ] + + ignore_queries = {} + + return sources_list, ignore_queries + + def gene_description_filter_regexps(self) -> List[str]: + return [] diff --git a/src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py b/src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py new file mode 100644 index 000000000..cde22b34f --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py @@ -0,0 +1,29 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species mus_musculus.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class mus_musculus(BasicMapper): + def official_name(self) -> str: + return "MGI" + + def set_transcript_names(self) -> None: + return None + + def set_display_xrefs(self) -> None: + display = DisplayXrefs(self) + display.set_display_xrefs_from_stable_table() diff --git a/src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py b/src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py new file mode 100644 index 000000000..df2bb072c --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py @@ -0,0 +1,33 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species neurospora_crassa.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class neurospora_crassa(BasicMapper): + def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["Uniprot_gn"] + + ignore_queries = {} + + return sources_list, ignore_queries + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["Uniprot_gn"] + + ignore_queries = {} + + return sources_list, ignore_queries diff --git a/src/python/ensembl/production/xrefs/mappers/species/parasite.py b/src/python/ensembl/production/xrefs/mappers/species/parasite.py new file mode 100644 index 000000000..408d84d08 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/parasite.py @@ -0,0 +1,46 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Mapper extension module for species parasite.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class parasite(BasicMapper): + def set_transcript_names(self) -> None: + return None + + def gene_description_sources(self) -> List[str]: + sources_list = [ + "RFAM", + "RNAMMER", + "TRNASCAN_SE", + "miRBase", + "HGNC", + "IMGT/GENE_DB", + "Uniprot/SWISSPROT", + "RefSeq_peptide", + "Uniprot/SPTREMBL", + ] + + return sources_list + + def gene_description_filter_regexps(self) -> List[str]: + regex = [ + r"^Uncharacterized protein\s*", + r"^Putative uncharacterized protein\s*", + r"^Hypothetical protein\s*", + ] + + return regex diff --git a/src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py b/src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py new file mode 100644 index 000000000..53925875d --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py @@ -0,0 +1,29 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species rattus_norvegicus.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class rattus_norvegicus(BasicMapper): + def official_name(self) -> str: + return "RGD" + + def set_transcript_names(self) -> None: + return None + + def set_display_xrefs(self) -> None: + display = DisplayXrefs(self) + display.set_display_xrefs_from_stable_table() diff --git a/src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py b/src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py new file mode 100644 index 000000000..707dcc7db --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py @@ -0,0 +1,41 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Mapper extension module for species saccharomyces_cerevisiae.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class saccharomyces_cerevisiae(BasicMapper): + def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["SGD_GENE"] + + ignore_queries = {} + + return sources_list, ignore_queries + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["SGD_TRANSCRIPT"] + + ignore_queries = {} + + return sources_list, ignore_queries + + def gene_description_sources(self) -> List[str]: + sources_list = ["SGD_GENE"] + + return sources_list + + def gene_description_filter_regexps(self) -> List[str]: + return [] diff --git a/src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py b/src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py new file mode 100644 index 000000000..742f1207c --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py @@ -0,0 +1,131 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species sars_cov_2.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class sars_cov_2(BasicMapper): + def set_transcript_names(self) -> None: + logging.info("Assigning transcript names from gene names") + + core_dbi = self.core().connect() + + # Reset transcript display xrefs + core_dbi.execute(update(TranscriptORM).values(display_xref_id=None)) + + # Get the max xref and object_xref IDs + xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() + xref_id = int(xref_id) + object_xref_id = core_dbi.execute( + select(func.max(ObjectXrefCORM.object_xref_id)) + ).scalar() + object_xref_id = int(object_xref_id) + + # Delete transcript name xrefs + core_dbi.execute( + delete(XrefCORM).where( + XrefCORM.xref_id == ObjectXrefCORM.xref_id, + ObjectXrefCORM.ensembl_object_type == "Transcript", + ExternalDbORM.external_db_id == XrefCORM.external_db_id, + ExternalDbORM.db_name.like("%_trans_name"), + ) + ) + + # Get all genes with set display_xref_id + query = select( + GeneORM.gene_id, + ExternalDbORM.db_name, + XrefCORM.dbprimary_acc, + XrefCORM.display_label, + XrefCORM.description, + ).where( + GeneORM.display_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == ExternalDbORM.external_db_id, + ) + for row in core_dbi.execute(query).mappings().all(): + # Get the ID of transcript name external DB + external_db_id = core_dbi.execute( + select(ExternalDbORM.external_db_id).where( + ExternalDbORM.db_name.like(f"{row.db_name}_trans_name") + ) + ).scalar() + + if not external_db_id: + raise LookupError( + f"No external_db_id found for '{row.db_name}_trans_name'" + ) + + # Get transcripts related to current gene + query = ( + select(TranscriptORM.transcript_id) + .where(TranscriptORM.gene_id == row.gene_id) + .order_by(TranscriptORM.seq_region_start, TranscriptORM.seq_region_end) + ) 
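+                # For each transcript of this gene, create a new "<db_name>_trans_name" xref carrying the gene's display label, attach it to the transcript with an object_xref, and set it as the transcript's display xref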
+ for transcript_row in core_dbi.execute(query).mappings().all(): + xref_id += 1 + object_xref_id += 1 + + info_text = f"via gene {row.dbprimary_acc}" + + # Insert new xref + core_dbi.execute( + insert(XrefCORM) + .values( + xref_id=xref_id, + external_db_id=external_db_id, + dbprimary_acc=row.display_label, + display_label=row.display_label, + version=0, + description=row.description, + info_type="MISC", + info_text=info_text, + ) + .prefix_with("IGNORE") + ) + + # Insert object xref + core_dbi.execute( + insert(ObjectXrefCORM).values( + object_xref_id=object_xref_id, + ensembl_id=transcript_row.transcript_id, + ensembl_object_type="Transcript", + xref_id=xref_id, + ) + ) + + # Set transcript display xref + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=xref_id) + .where(TranscriptORM.transcript_id == transcript_row.transcript_id) + ) + + # Delete object xrefs with no matching xref + query = ( + select(ObjectXrefCORM.object_xref_id) + .outerjoin(XrefCORM, XrefCORM.xref_id == ObjectXrefCORM.xref_id) + .where(XrefCORM.xref_id == None) + ) + result = core_dbi.execute(query).fetchall() + object_xref_ids = [row[0] for row in result] + + core_dbi.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.object_xref_id.in_(object_xref_ids) + ) + ) + + core_dbi.close() diff --git a/src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py b/src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py new file mode 100644 index 000000000..8c7d66d8e --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py @@ -0,0 +1,41 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species schizosaccharomyces_pombe.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class schizosaccharomyces_pombe(BasicMapper): + def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["PomBase_GENE"] + + ignore_queries = {} + + return sources_list, ignore_queries + + def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: + sources_list = ["PomBase_TRANSCRIPT"] + + ignore_queries = {} + + return sources_list, ignore_queries + + def gene_description_sources(self) -> List[str]: + sources_list = ["PomBase_GENE"] + + return sources_list + + def gene_description_filter_regexps(self) -> List[str]: + return [] diff --git a/src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py b/src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py new file mode 100644 index 000000000..a3182e7f7 --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py @@ -0,0 +1,29 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species sus_scrofa.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class sus_scrofa(BasicMapper): + def official_name(self) -> str: + return "PIGGY" + + def set_transcript_names(self) -> None: + return None + + def set_display_xrefs(self) -> None: + display = DisplayXrefs(self) + display.set_display_xrefs_from_stable_table() diff --git a/src/python/ensembl/production/xrefs/mappers/species/wormbase.py b/src/python/ensembl/production/xrefs/mappers/species/wormbase.py new file mode 100644 index 000000000..796d6260e --- /dev/null +++ b/src/python/ensembl/production/xrefs/mappers/species/wormbase.py @@ -0,0 +1,124 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mapper extension module for species wormbase.""" + +from ensembl.production.xrefs.mappers.BasicMapper import * + + +class wormbase(BasicMapper): + def set_display_xrefs(self) -> None: + logging.info( + "Building Transcript and Gene display_xrefs using WormBase direct xrefs" + ) + + core_dbi = self.core().connect() + + external_dbs, gene_display_xrefs, transcript_display_xrefs = {}, {}, {} + + # Get external_db IDs for the sources we are interested in + query = select(ExternalDbORM.external_db_id, ExternalDbORM.db_name).where( + ExternalDbORM.db_name.like("wormbase%") + ) + for row in core_dbi.execute(query).mappings().all(): + external_dbs[row.db_name] = row.external_db_id + + if not external_dbs.get("wormbase_transcript") or not external_dbs.get( + "wormbase_locus" + ): + logging.debug( + "Could not find wormbase_transcript and wormbase_locus in external_db table, so doing nothing" + ) + + core_dbi.close() + + return + + # Get genes with wormbase display xrefs + query = select(ObjectXrefCORM.ensembl_id, XrefCORM.xref_id).where( + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_dbs["wormbase_gseqname"], + ) + for row in core_dbi.execute(query).mappings().all(): + gene_display_xrefs[row.ensembl_id] = row.xref_id + + # Some genes will have a locus name. 
Overwrite display xrefs for those that do + query = select(ObjectXrefCORM.ensembl_id, XrefCORM.xref_id).where( + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_dbs["wormbase_locus"], + ) + for row in core_dbi.execute(query).mappings().all(): + gene_display_xrefs[row.ensembl_id] = row.xref_id + + # Get the wormbase_transcript xrefs for the genes + query = select(ObjectXrefCORM.ensembl_id, XrefCORM.xref_id).where( + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_dbs["wormbase_transcript"], + ) + for row in core_dbi.execute(query).mappings().all(): + transcript_display_xrefs[row.ensembl_id] = row.xref_id + + # Reset gene and transcript display xrefs + core_dbi.execute(update(GeneORM).values(display_xref_id=None)) + core_dbi.execute(update(TranscriptORM).values(display_xref_id=None)) + + # Now update + for gene_id, xref_id in gene_display_xrefs.items(): + core_dbi.execute( + update(GeneORM) + .values(display_xref_id=xref_id) + .where(GeneORM.gene_id == gene_id) + ) + + for transcript_id, xref_id in transcript_display_xrefs.items(): + core_dbi.execute( + update(TranscriptORM) + .values(display_xref_id=xref_id) + .where(TranscriptORM.transcript_id == transcript_id) + ) + + core_dbi.close() + + logging.info("Updated display xrefs in core for genes and transcripts") + + def set_transcript_names(self) -> None: + return None + + def gene_description_sources(self) -> List[str]: + sources_list = [ + "RFAM", + "RNAMMER", + "TRNASCAN_SE", + "miRBase", + "HGNC", + "IMGT/GENE_DB", + "Uniprot/SWISSPROT", + "RefSeq_peptide", + "Uniprot/SPTREMBL", + ] + + return sources_list + + def gene_description_filter_regexps(self) -> List[str]: + regex = [ + r"^(Protein \S+\s*)+$", + r"^Uncharacterized protein\s*\S+\s*", + r"^Uncharacterized protein\s*", + r"^Putative uncharacterized protein\s*\S+\s*", + r"^Putative uncharacterized protein\s*", + r"^Hypothetical protein\s*\S+\s*", + ] + + return regex diff --git a/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py b/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py new file mode 100644 index 000000000..988b92ffa --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py @@ -0,0 +1,161 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Parser module for ArrayExpress source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class ArrayExpressParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + species_name = args["species_name"] + file = args["file"] + dba = args["dba"] + ensembl_release = args["ensembl_release"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Extract db connection parameters from file name + project, db_user, db_host, db_port, db_name, db_pass = ( + self.extract_params_from_string( + file, ["project", "user", "host", "port", "dbname", "pass"] + ) + ) + if not db_user: + db_user = "ensro" + if not db_port: + db_port = "3306" + + # Get the species name(s) + species_id_to_names = self.species_id_to_names(xref_dbi) + if species_name: + species_id_to_names.setdefault(species_id, []).append(species_name) + + if not species_id_to_names.get(species_id): + return 0, "Skipped. Could not find species ID to name mapping" + names = species_id_to_names[species_id] + + # Look up the species in ftp server and check if active + species_lookup = self._get_species() + active = self._is_active(species_lookup, names, verbose) + if not active: + return 0, "Skipped. ArrayExpress source not active for species" + + species_name = species_id_to_names[species_id][0] + + # Connect to the appropriate arrayexpress db + if db_host: + arrayexpress_db_url = URL.create( + "mysql", db_user, db_pass, db_host, db_port, db_name + ) + elif project and project == "ensembl": + if verbose: + logging.info("Looking for db in mysql-ens-sta-1") + registry = "ensro@mysql-ens-sta-1:4519" + arrayexpress_db_url = self.get_db_from_registry( + species_name, "core", ensembl_release, registry + ) + elif project and project == "ensemblgenomes": + if verbose: + logging.info( + "Looking for db in mysql-eg-staging-1 and mysql-eg-staging-2" + ) + registry = "ensro@mysql-eg-staging-1.ebi.ac.uk:4160" + arrayexpress_db_url = self.get_db_from_registry( + species_name, "core", ensembl_release, registry + ) + + if not arrayexpress_db_url: + registry = "ensro@mysql-eg-staging-2.ebi.ac.uk:4275" + arrayexpress_db_url = self.get_db_from_registry( + species_name, "core", ensembl_release, registry + ) + elif dba: + arrayexpress_db_url = dba + else: + arrayexpress_db_url = None + + if not arrayexpress_db_url: + raise IOError( + f"Could not find ArrayExpress DB. Missing or unsupported project value. Supported values: ensembl, ensemblgenomes." 
+ ) + else: + if verbose: + logging.info(f"Found ArrayExpress DB: {arrayexpress_db_url}") + + xref_count = 0 + + db_engine = self.get_db_engine(arrayexpress_db_url) + with db_engine.connect() as arrayexpress_dbi: + query = select(GeneORM.stable_id).where( + GeneORM.biotype != "LRG_gene", GeneORM.is_current == 1 + ) + result = arrayexpress_dbi.execute(query).mappings().all() + + # Add direct xref for every current gene found + for row in result: + xref_id = self.add_xref( + { + "accession": row.stable_id, + "label": row.stable_id, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + self.add_direct_xref(xref_id, row.stable_id, "gene", "", xref_dbi) + + xref_count += 1 + + result_message = f"Added {xref_count} DIRECT xrefs" + + return 0, result_message + + def _get_species(self) -> Dict[str, int]: + ftp_server = "ftp.ebi.ac.uk" + ftp_dir = "pub/databases/microarray/data/atlas/bioentity_properties/ensembl" + + species_lookup = {} + + ftp = FTP(ftp_server) + ftp.login("anonymous", "-anonymous@") + ftp.cwd(ftp_dir) + remote_files = ftp.nlst() + ftp.close() + + for file in remote_files: + species = file.split(".")[0] + species_lookup[species] = 1 + + return species_lookup + + def _is_active(self, species_lookup: Dict[str, int], names: List[str], verbose: bool) -> bool: + # Loop through the names and aliases first. If we get a hit then great + active = False + for name in names: + if species_lookup.get(name): + if verbose: + logging.info( + f"Found ArrayExpress has declared the name {name}. This was an alias" + ) + active = True + break + + return active diff --git a/src/python/ensembl/production/xrefs/parsers/BaseParser.py b/src/python/ensembl/production/xrefs/parsers/BaseParser.py new file mode 100644 index 000000000..3ae7c2e2c --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/BaseParser.py @@ -0,0 +1,972 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base xref parser module to include all common functions used by xref parsers.""" + +from ensembl.production.xrefs.Base import * + + +class BaseParser(Base): + """Class to represent the base of xref parser modules. Inherits the xref Base class.""" + + def __init__(self, testing: bool = False) -> None: + if not testing: + super().__init__() + + self._direct_xref_tables = { + "gene": GeneDirectXrefORM, + "transcript": TranscriptDirectXrefORM, + "translation": TranslationDirectXrefORM, + } + self._xref_dependent_mapped = {} + + def get_source_id_for_source_name(self, source_name: str, dbi: Connection, priority_desc: str = None) -> int: + """Retrieves a source ID from its name and priority description from a database. 
+ + Parameters + ---------- + source_name: str + The name of the source + dbi: sqlalchemy.engine.Connection + The database connection to query in + priority_desc: str, optional + The priority description of the source (default is None) + + Returns + ------- + The source ID. + + Raises + ------ + KeyError + If no ID was found for the provided source name. + """ + low_name = source_name.lower() + + if priority_desc: + low_desc = priority_desc.lower() + query = select(SourceUORM.source_id).where( + func.lower(SourceUORM.name) == low_name, + func.lower(SourceUORM.priority_description) == low_desc, + ) + source_name = f"{source_name} ({priority_desc})" + else: + query = select(SourceUORM.source_id).where( + func.lower(SourceUORM.name) == low_name + ) + + result = dbi.execute(query) + if result: + source_id = result.scalar() + else: + raise KeyError(f"No source_id for source_name={source_name}") + + return source_id + + def get_source_name_for_source_id(self, source_id: int, dbi: Connection) -> str: + """Retrieves a source name from its ID from a database. + + Parameters + ---------- + source_id: int + The ID of the source + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + The source name. + + Raises + ------ + KeyError + If no name was found for the provided source ID. + """ + result = dbi.execute( + select(SourceUORM.name).where(SourceUORM.source_id == source_id) + ) + if result: + source_name = result.scalar() + else: + raise KeyError( + f"There is no entity with source-id {source_id} in the source-table of the xref-database. The source-id and the name of the source-id is hard-coded in populate_metadata.sql and in the parser. Couldn't get source name for source ID {source_id}" + ) + + return source_name + + def set_release(self, source_id: int, s_release: str, dbi: Connection) -> None: + """Sets the release value for a source in the source table of a database. + + Parameters + ---------- + source_id: str + The source ID + s_release: str + The release string + dbi: sqlalchemy.engine.Connection + The database connection to update in + """ + dbi.execute( + update(SourceUORM) + .where(SourceUORM.source_id == source_id) + .values(source_release=s_release) + ) + + def upload_xref_object_graphs(self, xrefs: List[Dict[str, Any]], dbi: Connection) -> None: + """Adds xref data into a database. + Uploads main xref data, related direct xrefs, dependent xrefs, and synonyms. + + Parameters + ---------- + xrefs: list + List of xrefs to upload + dbi: sqlalchemy.engine.Connection + The database connection to update in + + Raises + ------ + IOError + Failure is setting or retrieving an xref ID. 
+ """ + count = len(xrefs) + if count: + for xref in xrefs: + if not xref.get("ACCESSION") or not xref.get("SOURCE_ID"): + continue + + # Create entry in xref table and get ID + xref_id = self.add_xref( + { + "accession": xref["ACCESSION"], + "source_id": xref["SOURCE_ID"], + "species_id": xref["SPECIES_ID"], + "label": xref.get("LABEL", xref["ACCESSION"]), + "description": xref.get("DESCRIPTION"), + "version": xref.get("VERSION", 0), + "info_type": xref.get("INFO_TYPE", "MISC"), + }, + dbi, + True, + ) + + # Add direct xrefs + if xref.get("DIRECT_XREFS"): + for direct_xref in xref["DIRECT_XREFS"]: + direct_xref_id = self.add_xref( + { + "accession": xref["ACCESSION"], + "source_id": direct_xref["SOURCE_ID"], + "species_id": xref["SPECIES_ID"], + "label": xref.get("LABEL", xref["ACCESSION"]), + "description": xref.get("DESCRIPTION"), + "version": xref.get("VERSION", 0), + "info_type": direct_xref.get("LINKAGE_TYPE"), + }, + dbi, + True, + ) + + # direct_xref_id = self.get_xref_id(xref['ACCESSION'], direct_xref['SOURCE_ID'], xref['SPECIES_ID'], dbi) + self.add_direct_xref( + direct_xref_id, + direct_xref["STABLE_ID"], + direct_xref["ENSEMBL_TYPE"], + direct_xref["LINKAGE_TYPE"], + dbi, + ) + + # Error checking + if not xref_id: + raise IOError( + "xref_id is not set for %s %s %s %s %s" + % ( + xref["ACCESSION"], + xref["LABEL"], + xref["DESCRIPTION"], + xref["SOURCE_ID"], + xref["SPECIES_ID"], + ) + ) + + # Create entry in primary_xref table with sequence; if this is a "cumulative" + # entry it may already exist, and require an UPDATE rather than an INSERT + if xref.get("SEQUENCE"): + exists = dbi.execute( + select(PrimaryXrefORM.xref_id).where( + PrimaryXrefORM.xref_id == xref_id + ) + ).scalar() + + if exists: + query = ( + update(PrimaryXrefORM) + .where(PrimaryXrefORM.xref_id == xref_id) + .values(sequence=xref["SEQUENCE"]) + ) + else: + query = insert(PrimaryXrefORM).values( + xref_id=xref_id, + sequence=xref["SEQUENCE"], + sequence_type=xref["SEQUENCE_TYPE"], + status=xref.get("STATUS"), + ) + dbi.execute(query) + + # If there are synonyms, add entries in the synonym table + if xref.get("SYNONYMS"): + for synonym in xref["SYNONYMS"]: + self.add_synonym(xref_id, synonym, dbi) + + # If there are dependent xrefs, add xrefs and dependent xrefs for them + if xref.get("DEPENDENT_XREFS"): + for dependent_xref in xref.get("DEPENDENT_XREFS"): + # Insert the xref and get its xref_id + dependent_xref_id = self.add_xref( + { + "accession": dependent_xref["ACCESSION"], + "source_id": dependent_xref["SOURCE_ID"], + "species_id": xref["SPECIES_ID"], + "label": dependent_xref.get("LABEL"), + "description": dependent_xref.get("DESCRIPTION"), + "version": dependent_xref.get("VERSION"), + "info_type": "DEPENDENT", + }, + dbi, + ) + if not dependent_xref_id: + continue + + # Add the linkage_annotation and source id it came from + self.add_dependent_xref_maponly( + dependent_xref_id, + dependent_xref["LINKAGE_SOURCE_ID"], + xref_id, + dependent_xref.get("LINKAGE_ANNOTATION"), + dbi, + ) + + # If there are synonyms, add entries in the synonym table + if dependent_xref.get("SYNONYMS"): + for synonym in dependent_xref.get("SYNONYMS"): + self.add_synonym(dependent_xref_id, synonym, dbi) + + # Add the pair data. 
refseq dna/pep pairs usually + if xref_id and xref.get("PAIR"): + dbi.execute( + insert(PairsORM).values( + source_id=xref["SOURCE_ID"], + accession1=xref["ACCESSION"], + accession2=xref["PAIR"], + ) + ) + + def get_xref_id(self, accession: str, source_id: int, species_id: int, dbi: Connection) -> Optional[int]: + """Retrieves the xref row ID from accession, source ID, and species ID. + + Parameters + ---------- + accession: str + The xref accession + source_id: int + The xref source ID + species_id: int + The species ID + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + The xref ID, if found (else None). + """ + xref_ids = [] + + query = select(XrefUORM.xref_id).where( + XrefUORM.accession == accession, + XrefUORM.source_id == source_id, + XrefUORM.species_id == species_id, + ) + + for row in dbi.execute(query).mappings().all(): + xref_ids.append(row.xref_id) + + if len(xref_ids) > 0: + return xref_ids[0] + + return None + + def add_direct_xref(self, general_xref_id: int, ensembl_stable_id: str, ensembl_type: str, linkage_type: str, dbi: Connection) -> None: + """Adds data into direct xref tables in a database. + + Parameters + ---------- + general_xref_id: int + The xref ID related to the direct xref + ensembl_stable_id: str + The ensEMBL stable ID related to the direct xref + ensembl_type: str + The feature type (gene, transcript, or translation) + linkage_type: str + The type of link between the xref and ensEMBL feature + dbi: sqlalchemy.engine.Connection + The database connection to update in + """ + # Check if such a mapping exists yet + existing_xref_ids = self.get_direct_xref_id( + ensembl_stable_id, ensembl_type, linkage_type, dbi + ) + if general_xref_id in existing_xref_ids: + return None + + ensembl_type = ensembl_type.lower() + dbi.execute( + insert(self._direct_xref_tables[ensembl_type]).values( + general_xref_id=general_xref_id, + ensembl_stable_id=ensembl_stable_id, + linkage_xref=linkage_type, + ) + ) + + def add_to_direct_xrefs(self, args: Dict[str, Any], dbi: Connection) -> None: + """Adds direct xref data into both the xref table and direct xref tables in a database. + This calls the functions add_xref and add_direct_xref. + + Parameters + ---------- + args: dict + The direct xref arguments. 
These include: + - stable_id: The ensEMBL feature stable ID + - ensembl_type: The feature type (gene, transcript, or translation) + - accession: The xref accession + - source_id: The xref source ID + - species_id: The species ID + - version (optional): The xref version (default is 0) + - label (optional): The xref label (default is the xref accession) + - description (optional): The xref description + - linkage (optional): The type of link between the xref and ensEMBL + - info_text (optional): Additional info related to the xref (default is empty string) + - info_type (optional): The type of xref being added (default is DIRECT) + dbi: sqlalchemy.engine.Connection + The database connection to update in + """ + stable_id = args["stable_id"] + ensembl_type = args["ensembl_type"] + accession = args["accession"] + source_id = args["source_id"] + species_id = args["species_id"] + version = args.get("version", 0) + label = args.get("label", accession) + description = args.get("description") + linkage = args.get("linkage") + info_text = args.get("info_text", "") + + args["info_type"] = args.get("info_type", "DIRECT") + + # If the accession already has an xref find it else cretae a new one + direct_xref_id = self.add_xref(args, dbi) + self.add_direct_xref(direct_xref_id, stable_id, ensembl_type, linkage, dbi) + + def get_direct_xref_id(self, stable_id: str, ensembl_type: str, link: str, dbi: Connection) -> int: + """Retrieves the direct xref row ID from stable ID, ensEMBL type and linkage type. + + Parameters + ---------- + stable_id: str + The ensEMBL feature stable ID + ensembl_type: str + The feature type (gene, transcript, or translation) + link: str + The type of link between the xref and ensEMBL + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + The direct xref ID(s). + """ + direct_xref_ids = [] + + ensembl_type = ensembl_type.lower() + ensembl_table = self._direct_xref_tables[ensembl_type] + query = select(ensembl_table.general_xref_id).where( + ensembl_table.ensembl_stable_id == stable_id, + ensembl_table.linkage_xref == link, + ) + + for row in dbi.execute(query).mappings().all(): + direct_xref_ids.append(row.general_xref_id) + + return direct_xref_ids + + def add_xref(self, args: Dict[str, Any], dbi: Connection, update_label_desc: bool = False) -> int: + """Adds data into xref table in a database and returns the xref ID. + This function first checks if an xref already exists with the provided data. + + Parameters + ---------- + args: dict + The direct xref arguments. These include: + - accession: The xref accession + - source_id: The xref source ID + - species_id: The species ID + - label (optional): The xref label (default is the xref accession) + - description (optional): The xref description + - version (optional): The xref version (default is 0) + - info_type (optional): The type of xref being added (default is MISC) + - info_text (optional): Additional info related to the xref (default is empty string) + dbi: sqlalchemy.engine.Connection + The database connection to update in + update_label_desc: bool, optional + If set to True, the xref label and description will be updated even if the xref data already exists in the database (default is False) + + Returns + ------- + The xref ID (existing or newly added). 
+ """ + accession = args["accession"] + source_id = args["source_id"] + species_id = args["species_id"] + label = args.get("label", accession) + description = args.get("description") + version = args.get("version", 0) + info_type = args.get("info_type", "MISC") + info_text = args.get("info_text", "") + + # If the description is more than 255 characters, chop it off and add + # an indication that it has been truncated to the end of it. + if description and len(description) > 255: + description = description[0:249] + " /.../" + + # See if it already exists. If so return the xref_id for this one. + xref_id = self.get_xref_id(accession, source_id, species_id, dbi) + if xref_id: + if update_label_desc: + if label: + dbi.execute( + update(XrefUORM) + .where(XrefUORM.xref_id == xref_id) + .values(label=label) + ) + if description: + dbi.execute( + update(XrefUORM) + .where(XrefUORM.xref_id == xref_id) + .values(description=description) + ) + return xref_id + + # Add new xref + dbi.execute( + insert(XrefUORM).values( + accession=accession, + version=version, + label=label, + description=description, + source_id=source_id, + species_id=species_id, + info_type=info_type, + info_text=info_text, + ) + ) + + xref_id = self.get_xref_id(accession, source_id, species_id, dbi) + return xref_id + + def add_dependent_xref(self, args: Dict[str, Any], dbi: Connection) -> int: + """Adds data into the xref table and dependent xref table in a database. + + Parameters + ---------- + args: dict + The direct xref arguments. These include: + - master_xref_id: The main xref ID which the dependent xref is dependent on + - accession: The dependent xref accession + - source_id: The dependent xref source ID + - species_id: The species ID + - version (optional): The dependent xref version (default is 0) + - label (optional): The dependent xref label (default is the dependent xref accession) + - description (optional): The dependent xref description + - linkage (optional): The source ID of the main xref which the dependent xref id dependent on + - info_text (optional): Additional info related to the dependent xref (default is empty string) + - info_type (optional): The type of xref being added (default is DEPENDENT) + dbi: sqlalchemy.engine.Connection + The database connection to update in + + Returns + ------- + The dependent xref ID. + """ + master_xref_id = args["master_xref_id"] + accession = args["accession"] + source_id = args["source_id"] + species_id = args["species_id"] + version = args.get("version", 0) + label = args.get("label", accession) + description = args.get("description") + linkage = args.get("linkage") + info_text = args.get("info_text", "") + + args["info_type"] = args.get("info_type", "DEPENDENT") + + # If the accession already has an xref find it else cretae a new one + dependent_xref_id = self.add_xref(args, dbi) + self.add_dependent_xref_maponly( + dependent_xref_id, source_id, master_xref_id, linkage, dbi + ) + + return dependent_xref_id + + def add_dependent_xref_maponly(self, dependent_id: int, dependent_source_id: int, master_id: int, master_source_id: int, dbi: Connection, update_info_type: bool = False) -> None: + """Adds data into the dependent xref table in a database. + This function only adds the dependent connection if it hasn't been added before (from a cache). 
+ + Parameters + ---------- + dependent_id: int + The dependent xref ID + dependent_source_id: int + The source ID of the dependent xref + master_id: int + The master xref ID + master_source_id: int + The source ID of the master xref + dbi: sqlalchemy.engine.Connection + The database connection to update in + update_info_type: bool, optional + If set to True, the info_type column of the xref table related to the dependent xref will be updated to 'DEPENDENT' (default is False) + """ + index = f"{master_id}|{dependent_id}" + if ( + not self._xref_dependent_mapped.get(index) + or self._xref_dependent_mapped[index] != master_source_id + ): + dbi.execute( + insert(DependentXrefUORM) + .values( + master_xref_id=master_id, + dependent_xref_id=dependent_id, + linkage_annotation=master_source_id, + linkage_source_id=dependent_source_id, + ) + .prefix_with("IGNORE") + ) + + self._xref_dependent_mapped[index] = master_source_id + + if update_info_type: + self._update_xref_info_type(dependent_id, "DEPENDENT", dbi) + + def _update_xref_info_type(self, xref_id: int, info_type: str, dbi: Connection) -> None: + """Updates the info_type column of the xref table. + + Parameters + ---------- + xref_id: int + The xref ID + info_type: str + The info type value to update + dbi: sqlalchemy.engine.Connection + The database connection to update in + """ + dbi.execute( + update(XrefUORM) + .where(XrefUORM.xref_id == xref_id) + .values(info_type=info_type) + ) + + def get_xref_sources(self, dbi: Connection) -> Dict[str, int]: + """Retrieves the xref source names and ID from a database. + + Parameters + ---------- + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + A dict variable containing {'source_name' : 'source_ID'} items. + """ + sourcename_to_sourceid = {} + + query = select(SourceUORM.name, SourceUORM.source_id) + + for row in dbi.execute(query).mappings().all(): + sourcename_to_sourceid[row.name] = row.source_id + + return sourcename_to_sourceid + + def add_synonym(self, xref_id: int, synonym: str, dbi: Connection) -> None: + """Adds synonym data into the synonym table if a database. + + Parameters + ---------- + xref_id: int + The xref ID related to the synonym + synonym: str + The xref synonym + dbi: sqlalchemy.engine.Connection + The database connection to update in + """ + dbi.execute( + insert(SynonymORM) + .values(xref_id=xref_id, synonym=synonym) + .prefix_with("IGNORE") + ) + + def get_ext_synonyms(self, source_name: str, dbi: Connection) -> Dict[str, List[str]]: + """Retrieves the list of synonyms for a specific xref source. + + Parameters + ---------- + source_name: str + The xref source name + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + A dict variable containing {'accession' or 'label' : [list of synonyms]} items. 
+        """
+        ext_syns = {}
+        seen = {}
+        separator = ":"
+
+        query = (
+            select(XrefUORM.accession, XrefUORM.label, SynonymORM.synonym)
+            .where(
+                XrefUORM.xref_id == SynonymORM.xref_id,
+                SourceUORM.source_id == XrefUORM.source_id,
+            )
+            .filter(SourceUORM.name.like(source_name))
+        )
+
+        count = 0
+        for row in dbi.execute(query).mappings().all():
+            acc_syn = row.accession + separator + row.synonym
+            if not seen.get(acc_syn):
+                ext_syns.setdefault(row.accession, []).append(row.synonym)
+                ext_syns.setdefault(row.label, []).append(row.synonym)
+                count += 1
+
+            seen[acc_syn] = 1
+
+        return ext_syns
+
+    def build_dependent_mappings(self, source_id: int, dbi: Connection) -> None:
+        """Builds the dependent mappings cache for a specific xref source.
+        The resulting cache is a dict variable containing {'master_xref_id|dependent_xref_id' : 'linkage_annotation'} items.
+
+        Parameters
+        ----------
+        source_id: int
+            The xref source ID
+        dbi: sqlalchemy.engine.Connection
+            The database connection to query in
+        """
+        query = select(
+            DependentXrefUORM.master_xref_id,
+            DependentXrefUORM.dependent_xref_id,
+            DependentXrefUORM.linkage_annotation,
+        ).where(
+            DependentXrefUORM.dependent_xref_id == XrefUORM.xref_id,
+            XrefUORM.source_id == source_id,
+        )
+
+        for row in dbi.execute(query).mappings().all():
+            # The xref IDs are integers, so format the cache key explicitly
+            # instead of concatenating them with "|"
+            self._xref_dependent_mapped[
+                f"{row.master_xref_id}|{row.dependent_xref_id}"
+            ] = row.linkage_annotation
+
+    def get_valid_codes(self, source_name: str, species_id: int, dbi: Connection) -> Dict[str, List[int]]:
+        """Retrieves the xref accessions and IDs related to a specific xref source and species from a database.
+
+        Parameters
+        ----------
+        source_name: str
+            The xref source name
+        species_id: int
+            The species ID
+        dbi: sqlalchemy.engine.Connection
+            The database connection to query in
+
+        Returns
+        -------
+        A dict variable containing {'accession' : [list of xref IDs]} items.
+        """
+        valid_codes = {}
+        sources = []
+
+        big_name = "%" + source_name.upper() + "%"
+        query = select(SourceUORM.source_id).filter(
+            func.upper(SourceUORM.name).like(big_name)
+        )
+        for row in dbi.execute(query).fetchall():
+            sources.append(row[0])
+
+        for source_id in sources:
+            query = select(XrefUORM.accession, XrefUORM.xref_id).where(
+                XrefUORM.species_id == species_id, XrefUORM.source_id == source_id
+            )
+            for row in dbi.execute(query).fetchall():
+                valid_codes.setdefault(row[0], []).append(row[1])
+
+        return valid_codes
+
+    def is_file_header_valid(self, columns_count: int, field_patterns: List[str], header: List[str], case_sensitive: bool = False) -> bool:
+        """Checks whether the provided file header is valid by checking length and column patterns.
+
+        Parameters
+        ----------
+        columns_count: int
+            The number of columns to be expected in the header
+        field_patterns: list
+            The column patterns for the header to satisfy
+        header: list
+            The file header to check
+        case_sensitive: bool, optional
+            If set to True, header fields will be parsed as is, as opposed to lower-cased (default is False)
+
+        Returns
+        -------
+        True if the header is valid.
+        False if the header is not valid.
+ """ + # Check number of columns + if len(header) < columns_count: + return False + + # Check column patterns + for pattern in field_patterns: + header_field = header.pop(0) + if not case_sensitive: + header_field = header_field.lower() + if pattern and not re.search(pattern, header_field): + return False + + # If we have made it this far, all should be in order + return True + + def add_to_syn(self, accession: str, source_id: int, synonym: str, species_id: int, dbi: Connection) -> None: + """Add synomyn data for an xref given its accession and source ID. + + Parameters + ---------- + accession: str + The xref accession + source_id: int + The xref source ID + synonym: str + The xref synonym + species_id: int + The species ID + dbi: sqlalchemy.engine.Connection + The database connection to update in + + Raises + ------ + KeyError + If no xref is found for accession, source ID, and species ID. + """ + xref_id = self.get_xref_id(accession, source_id, species_id, dbi) + if xref_id: + self.add_synonym(xref_id, synonym, dbi) + else: + raise KeyError( + f"Could not find acc {accession} in xref table source = {source_id} of species {species_id}" + ) + + def add_to_syn_for_mult_sources(self, accession: str, sources: List[int], synonym: str, species_id: int, dbi: Connection) -> None: + """Adds synonym data for multiple sources. + + Parameters + ---------- + accession: str + The xref accession + sources: list + List of xref sources to add synonyms for + synonym: str + The xref synonym + species_id: int + The species ID + dbi: sqlalchemy.engine.Connection + The database connection to update in + """ + for source_id in sources: + xref_id = self.get_xref_id(accession, source_id, species_id, dbi) + if xref_id: + self.add_synonym(xref_id, synonym, dbi) + + def species_id_to_names(self, dbi: Connection) -> Dict[int, List[str]]: + """Creates a dictionary that contains the name and aliases for every species ID. + + Parameters + ---------- + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + A dict variable containing {'species_id' : [list of names/synonyms]} items. + """ + id_to_names = {} + + # Query the species table + query = select(SpeciesORM.species_id, SpeciesORM.name) + for row in dbi.execute(query).mappings().all(): + id_to_names[row.species_id] = [row.name] + + # Also populate the dict with all the aliases + query = select(SpeciesORM.species_id, SpeciesORM.aliases) + for row in dbi.execute(query).mappings().all(): + for name in re.split(r",\s*", row.aliases, flags=re.MULTILINE | re.DOTALL): + id_to_names.setdefault(row.species_id, []).append(name) + + return id_to_names + + def species_id_to_taxonomy(self, dbi: Connection) -> Dict[int, List[int]]: + """Creates a dictionary that contains the taxonomy IDs for every species ID. + + Parameters + ---------- + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + A dict variable containing {'species_id' : [list of taxonomy IDs]} items. + """ + id_to_taxonomy = {} + + # Query the species table + query = select(SpeciesORM.species_id, SpeciesORM.taxonomy_id) + for row in dbi.execute(query).mappings().all(): + id_to_taxonomy.setdefault(row.species_id, []).append(row.taxonomy_id) + + return id_to_taxonomy + + def get_valid_xrefs_for_dependencies(self, dependent_name: str, reverse_ordered_source_list: List[str], dbi: Connection) -> Dict[str, int]: + """Get a hash to go from accession of a dependent xref to master_xref_id for all of source names given. 
+ + Parameters + ---------- + dependent_name: str + The dependent source name + reverse_ordered_source_list: list + List of source names + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + A dict variable containing {'accession' : 'master_xred_id'} items. + """ + dependent_2_xref = {} + dependent_sources = [] + sources = [] + + query = select(SourceUORM.source_id).where( + func.lower(SourceUORM.name) == dependent_name.lower() + ) + for row in dbi.execute(query).fetchall(): + dependent_sources.append(row[0]) + + for name in reverse_ordered_source_list: + query = select(SourceUORM.source_id).where( + func.lower(SourceUORM.name) == name.lower() + ) + for row in dbi.execute(query).fetchall(): + sources.append(row[0]) + + Xref1 = aliased(XrefUORM) + Xref2 = aliased(XrefUORM) + + for dependent in dependent_sources: + for source in sources: + query = select(DependentXrefUORM.master_xref_id, Xref2.accession).where( + Xref1.xref_id == DependentXrefUORM.master_xref_id, + Xref1.source_id == source, + Xref2.xref_id == DependentXrefUORM.dependent_xref_id, + Xref2.source_id == dependent, + ) + for row in dbi.execute(query).fetchall(): + dependent_2_xref[row[1]] = row[0] + + return dependent_2_xref + + def get_source_ids_for_source_name_pattern(self, source_name: str, dbi: Connection) -> List[int]: + """Gets a set of source IDs matching a source name pattern. + + Parameters + ---------- + source_name: str + The name of the source + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + A list of source IDs. + """ + big_name = "%" + source_name.upper() + "%" + sources = [] + + query = select(SourceUORM.source_id).where( + func.upper(SourceUORM.name).like(big_name) + ) + for row in dbi.execute(query).fetchall(): + sources.append(row[0]) + + return sources + + def get_acc_to_label(self, source_name: str, species_id: int, dbi: Connection) -> Dict[str, str]: + """Creates a hash that uses the accession as a key and the label as the value. + + Parameters + ---------- + source_name: str + The name of the source + species_id: int + The species ID + dbi: sqlalchemy.engine.Connection + The database connection to query in + + Returns + ------- + A dict variable containing {'accession' : 'label'} items. + """ + acc_to_label = {} + + source_name = source_name + "%" + query = select(XrefUORM.accession, XrefUORM.label).where( + XrefUORM.source_id == SourceUORM.source_id, + SourceUORM.name.like(source_name), + XrefUORM.species_id == species_id, + ) + for row in dbi.execute(query).mappings().all(): + acc_to_label[row.accession] = row.label + + return acc_to_label + + def extract_params_from_string(self, string: str, parameters: List[str]) -> List[str]: + values = [] + + for param in parameters: + val = None + + match = re.search(param + r"[=][>](\S+?)[,]", string) + if match: + val = match.group(1) + + values.append(val) + + return values diff --git a/src/python/ensembl/production/xrefs/parsers/CCDSParser.py b/src/python/ensembl/production/xrefs/parsers/CCDSParser.py new file mode 100644 index 000000000..f2e258716 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/CCDSParser.py @@ -0,0 +1,101 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for CCDS source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class CCDSParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + dba = args["dba"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Extract db connection parameters from file + db_user = "ensro" + db_host, db_port, db_name, db_pass = self.extract_params_from_string( + file, ["host", "port", "dbname", "pass"] + ) + if not db_port: + db_port = "3306" + + # Connect to the appropriate db + if db_host: + ccds_db_url = URL.create( + "mysql", db_user, db_pass, db_host, db_port, db_name + ) + elif dba: + ccds_db_url = dba + + if not ccds_db_url: + return 1, "Could not find CCDS DB." + else: + if verbose: + logging.info(f"Found CCDS DB: {ccds_db_url}") + + # Get data from ccds db + db_engine = self.get_db_engine(ccds_db_url) + with db_engine.connect() as ccds_dbi: + query = ( + select(TranscriptORM.stable_id, XrefCORM.dbprimary_acc) + .where( + XrefCORM.xref_id == ObjectXrefCORM.xref_id, + ObjectXrefCORM.ensembl_object_type == "Transcript", + ObjectXrefCORM.ensembl_id == TranscriptORM.transcript_id, + ExternalDbORM.external_db_id == XrefCORM.external_db_id, + ) + .filter(ExternalDbORM.db_name.like("Ens_%_transcript")) + ) + result = ccds_dbi.execute(query).mappings().all() + + xref_count, direct_count = 0, 0 + seen = {} + + for row in result: + stable_id = row.stable_id + display_label = row.dbprimary_acc + + (acc, version) = display_label.split(".") + + if not seen.get(display_label): + xref_id = self.add_xref( + { + "accession": acc, + "version": version, + "label": display_label, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + + xref_count += 1 + seen[display_label] = xref_id + else: + xref_id = seen[display_label] + + self.add_direct_xref(xref_id, stable_id, "Transcript", "", xref_dbi) + direct_count += 1 + + result_message = f"Parsed CCDS identifiers from {file}, added {xref_count} xrefs and {direct_count} direct_xrefs" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/DBASSParser.py b/src/python/ensembl/production/xrefs/parsers/DBASSParser.py new file mode 100644 index 000000000..9f3f6243a --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/DBASSParser.py @@ -0,0 +1,114 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for DBASS sources.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + +EXPECTED_NUMBER_OF_COLUMNS = 23 + + +class DBASSParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + + if not source_id or not species_id or not xref_file: + raise AttributeError("Need to pass source_id, species_id and file") + + file_io = self.get_filehandle(xref_file) + csv_reader = csv.reader(file_io) + + # Check if header is valid + header = next(csv_reader) + patterns = [r"^id$", r"^genesymbol$", None, r"^ensemblreference$"] + if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise IOError(f"Malformed or unexpected header in DBASS file {xref_file}") + + processed_count = 0 + unmapped_count = 0 + + # Read lines + for line in csv_reader: + if not line: + continue + + if len(line) < EXPECTED_NUMBER_OF_COLUMNS: + line_number = 2 + processed_count + unmapped_count + raise IOError( + f"Line {line_number} of input file {xref_file} has an incorrect number of columns" + ) + + dbass_gene_id = line[0] + dbass_gene_name = line[1] + dbass_full_name = line[2] + ensembl_id = line[3] + + # Do not attempt to create unmapped xrefs. Checking truthiness is good + # enough here because the only non-empty string evaluating as false is + # not a valid Ensembl stable ID. + if ensembl_id: + # DBASS files list synonyms in two ways: either "FOO (BAR)" (with or + # without space) or "FOO/BAR". Both forms are relevant to us. + match = re.search( + r"(.*)\s?/\s?(.*)", dbass_gene_name, re.IGNORECASE | re.DOTALL + ) + if match: + first_gene_name = match.group(1) + second_gene_name = match.group(2) + else: + match = re.search( + r"(.*)\s?\((.*)\)", dbass_gene_name, re.IGNORECASE | re.DOTALL + ) + if match: + first_gene_name = match.group(1) + second_gene_name = match.group(2) + else: + first_gene_name = dbass_gene_name + second_gene_name = None + + label = first_gene_name + synonym = second_gene_name + ensembl_type = "gene" + version = "1" + + xref_id = self.add_xref( + { + "accession": dbass_gene_id, + "version": version, + "label": label, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + + if synonym: + self.add_synonym(xref_id, synonym, xref_dbi) + + self.add_direct_xref(xref_id, ensembl_id, ensembl_type, "", xref_dbi) + + processed_count += 1 + else: + unmapped_count += 1 + + file_io.close() + + result_message = f"{processed_count} direct xrefs successfully processed\n" + result_message += f"Skipped {unmapped_count} unmapped xrefs" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py b/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py new file mode 100644 index 000000000..33a7328a2 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py @@ -0,0 +1,120 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for EntrezGene and WikiGene sources.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + +EXPECTED_NUMBER_OF_COLUMNS = 16 + + +class EntrezGeneParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + wiki_source_id = self.get_source_id_for_source_name("WikiGene", xref_dbi) + if verbose: + logging.info(f"Wiki source id = {wiki_source_id}") + + file_io = self.get_filehandle(file) + csv_reader = csv.reader(file_io, delimiter="\t") + + # Check if header is valid + header = next(csv_reader) + patterns = [ + r"\A[#]?\s*tax_id", + "geneid", + "symbol", + "locustag", + "synonyms", + "dbxrefs", + "chromosome", + "map_location", + "description", + "type_of_gene", + "symbol_from_nomenclature_authority", + "full_name_from_nomenclature_authority", + "nomenclature_status", + "other_designations", + "modification_date", + "feature_type", + ] + if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise IOError(f"Malformed or unexpected header in EntrezGene file {file}") + + xref_count = 0 + syn_count = 0 + seen = {} + + # Read lines + for line in csv_reader: + if not line: + continue + + tax_id = line[0] + acc = line[1] + symbol = line[2] + synonyms = line[4] + desc = line[8] + + if tax_id != species_id: + continue + if seen.get(acc): + continue + + xref_id = self.add_xref( + { + "accession": acc, + "label": symbol, + "description": desc, + "source_id": source_id, + "species_id": species_id, + "info_type": "DEPENDENT", + }, + xref_dbi, + ) + self.add_xref( + { + "accession": acc, + "label": symbol, + "description": desc, + "source_id": wiki_source_id, + "species_id": species_id, + "info_type": "DEPENDENT", + }, + xref_dbi, + ) + + xref_count += 1 + + syns = re.split(r"\|", synonyms) + for synonym in syns: + if synonym != "-": + self.add_synonym(xref_id, synonym, xref_dbi) + syn_count += 1 + + seen[acc] = 1 + + file_io.close() + + result_message = f"{xref_count} EntrezGene Xrefs and {xref_count} WikiGene Xrefs added with {syn_count} synonyms" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/HGNCParser.py b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py new file mode 100644 index 000000000..9bcda9cbd --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py @@ -0,0 +1,421 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for HGNC source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + +from unidecode import unidecode +import codecs + + +class HGNCParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + dba = args["dba"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Parse the file string and set default user + file_params = self.parse_file_string(file) + if not file_params.get("user"): + file_params["user"] = "ensro" + + # Prepare lookup lists + swissprot = self.get_valid_codes("Uniprot/SWISSPROT", species_id, xref_dbi) + refseq = self.get_valid_codes("refseq", species_id, xref_dbi) + source_list = ["refseq_peptide", "refseq_mRNA"] + entrezgene = self.get_valid_xrefs_for_dependencies( + "EntrezGene", source_list, xref_dbi + ) + + # Prepare sources + self_source_name = self.get_source_name_for_source_id(source_id, xref_dbi) + source_ids = { + "ccds": self.get_source_id_for_source_name( + self_source_name, xref_dbi, "ccds" + ), + "entrezgene_manual": self.get_source_id_for_source_name( + self_source_name, xref_dbi, "entrezgene_manual" + ), + "refseq_manual": self.get_source_id_for_source_name( + self_source_name, xref_dbi, "refseq_manual" + ), + "ensembl_manual": self.get_source_id_for_source_name( + self_source_name, xref_dbi, "ensembl_manual" + ), + "desc_only": self.get_source_id_for_source_name( + self_source_name, xref_dbi, "desc_only" + ), + "lrg": self.get_source_id_for_source_name("LRG_HGNC_notransfer", xref_dbi), + "genecards": self.get_source_id_for_source_name("GeneCards", xref_dbi), + } + + # Statistics counts + name_count = { + "ccds": 0, + "lrg": 0, + "ensembl_manual": 0, + "genecards": 0, + "refseq_manual": 0, + "entrezgene_manual": 0, + } + mismatch = 0 + + # Connect to the ccds db + ccds_db_url = None + if dba: + ccds_db_url = dba + elif file_params.get("host"): + ccds_db_url = URL.create( + "mysql", + file_params["user"], + file_params["pass"], + file_params["host"], + file_params["port"], + file_params["dbname"], + ) + else: + raise AttributeError("No ensembl ccds database provided") + + if not ccds_db_url: + raise AttributeError("No ensembl ccds database provided") + else: + if verbose: + logging.info(f"Found ccds DB: {ccds_db_url}") + + # Get CCDS data + db_engine = self.get_db_engine(ccds_db_url) + with db_engine.connect() as ccds_dbi: + query = ( + select(TranscriptAttribORM.value, TranscriptORM.stable_id) + .join( + TranscriptAttribORM, + TranscriptORM.transcript_id == TranscriptAttribORM.transcript_id, + ) + .join( + AttribTypeORM, + TranscriptAttribORM.attrib_type_id == AttribTypeORM.attrib_type_id, + ) + .where(AttribTypeORM.code == "ccds_transcript") + ) + result = ccds_dbi.execute(query).mappings().all() + + ccds_to_ens = {} + for row in result: + # Remove version + ccds_id = re.sub(r"\.\d+", "", row.value) + + ccds_to_ens[ccds_id] = row.stable_id + + # Get HGNC file (wget 
or disk) + mem_file = file + if file_params.get("wget"): + response = requests.get(file_params["wget"]) + if not response.ok: + raise IOError(response.reason) + mem_file = response.text + + # Make sure the file is utf8 + mem_file = codecs.encode(mem_file, "utf-8").decode("utf-8") + mem_file = re.sub(r'"', '', mem_file) + + file_io = self.get_filehandle(mem_file) + csv_reader = csv.DictReader(file_io, delimiter="\t") + + # Read lines + for line in csv_reader: + accession = line["HGNC ID"] + symbol = line["Approved symbol"] + name = line["Approved name"] + previous_symbols = line["Previous symbols"] + synonyms = line["Alias symbols"] + + seen = 0 + + # Direct CCDS to ENST mappings + ccds = line["CCDS IDs"] + ccds_list = [] + if ccds: + ccds_list = re.split(r",\s", ccds) + + for ccds in ccds_list: + enst_id = ccds_to_ens.get(ccds) + if not enst_id: + continue + + self.add_to_direct_xrefs( + { + "stable_id": enst_id, + "ensembl_type": "gene", + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids["ccds"], + "species_id": species_id, + }, + xref_dbi, + ) + self.add_synonyms_for_hgnc( + { + "source_id": source_ids["ccds"], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + name_count["ccds"] += 1 + + # Direct LRG to ENST mappings + lrg_id = line["Locus specific databases"] + if lrg_id: + match = re.search(r"(LRG_\d+)\|", lrg_id) + if match: + lrg_id = match.group(1) + + self.add_to_direct_xrefs( + { + "stable_id": lrg_id, + "ensembl_type": "gene", + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids["lrg"], + "species_id": species_id, + }, + xref_dbi, + ) + self.add_synonyms_for_hgnc( + { + "source_id": source_ids["lrg"], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + name_count["lrg"] += 1 + + # Direct Ensembl mappings + ensg_id = line["Ensembl gene ID"] + if ensg_id: + seen = 1 + + self.add_to_direct_xrefs( + { + "stable_id": ensg_id, + "ensembl_type": "gene", + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids["ensembl_manual"], + "species_id": species_id, + }, + xref_dbi, + ) + self.add_synonyms_for_hgnc( + { + "source_id": source_ids["ensembl_manual"], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + name_count["ensembl_manual"] += 1 + + # GeneCards + direct_id = self.get_xref_id( + accession, source_ids["ensembl_manual"], species_id, xref_dbi + ) + hgnc_id = re.search(r"HGNC:(\d+)", accession).group(1) + + self.add_dependent_xref( + { + "master_xref_id": direct_id, + "accession": hgnc_id, + "label": symbol, + "description": name, + "source_id": source_ids["genecards"], + "species_id": species_id, + }, + xref_dbi, + ) + self.add_synonyms_for_hgnc( + { + "source_id": source_ids["genecards"], + "name": hgnc_id, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + name_count["genecards"] += 1 + + # RefSeq + refseq_id = line["RefSeq IDs"] + if refseq_id and refseq.get(refseq_id): + seen = 1 + + for xref_id in refseq[refseq_id]: + self.add_dependent_xref( + { + "master_xref_id": xref_id, + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids["refseq_manual"], + "species_id": species_id, + }, + xref_dbi, + ) + name_count["refseq_manual"] += 1 + + 
self.add_synonyms_for_hgnc( + { + "source_id": source_ids["refseq_manual"], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + # EntrezGene + entrez_id = line["NCBI Gene ID"] + if entrez_id and entrezgene.get(entrez_id): + seen = 1 + + self.add_dependent_xref( + { + "master_xref_id": entrezgene[entrez_id], + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids["entrezgene_manual"], + "species_id": species_id, + }, + xref_dbi, + ) + self.add_synonyms_for_hgnc( + { + "source_id": source_ids["entrezgene_manual"], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + name_count["entrezgene_manual"] += 1 + + # Store to keep descriptions if not stored yet + if not seen: + xref_id = self.add_xref( + { + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids["desc_only"], + "species_id": species_id, + "info_type": "MISC", + }, + xref_dbi, + ) + self.add_synonyms_for_hgnc( + { + "source_id": source_ids["desc_only"], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + mismatch += 1 + + file_io.close() + + result_message = "HGNC xrefs loaded:\n" + for count_type, count in name_count.items(): + result_message += f"\t{count_type}\t{count}\n" + result_message += f"{mismatch} HGNC ids could not be associated in xrefs" + + return 0, result_message + + def add_synonyms_for_hgnc(self, args: Dict[str, Any], dbi: Connection) -> None: + source_id = args["source_id"] + name = args["name"] + species_id = args["species_id"] + dead_string = args.get("dead") + alias_string = args.get("alias") + + # Dead name, add to synonym + if dead_string: + dead_string = re.sub('"', "", dead_string) + dead_array = re.split(r",\s", dead_string) + + for dead in dead_array: + try: + dead = dead.decode("utf-8") + except: + pass + dead = unidecode(dead.upper()) + + self.add_to_syn(name, source_id, dead, species_id, dbi) + + # Alias name, add to synonym + if alias_string: + alias_string = re.sub('"', "", alias_string) + alias_array = re.split(r",\s", alias_string) + + for alias in alias_array: + try: + alias = alias.decode("utf-8") + except: + pass + alias = unidecode(alias.upper()) + + self.add_to_syn(name, source_id, alias, species_id, dbi) + + def parse_file_string(self, file_string: str) -> Dict[str, str]: + # file_string = re.sub(r"\A\w+:", "", file_string) + file_string = re.sub(r"^\w+:", "", file_string) + + param_pairs = file_string.split(",") + params = {} + + # Set provided values + for pair in param_pairs: + if re.search("=>", pair): + key, value = pair.split("=>") + params[key] = value + + return params diff --git a/src/python/ensembl/production/xrefs/parsers/HPAParser.py b/src/python/ensembl/production/xrefs/parsers/HPAParser.py new file mode 100644 index 000000000..76c99d769 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/HPAParser.py @@ -0,0 +1,74 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for HPA source.""" + +import csv + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser + +EXPECTED_NUMBER_OF_COLUMNS = 4 + + +class HPAParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + file_io = self.get_filehandle(file) + csv_reader = csv.reader(file_io, delimiter=",", strict=True) + + # Check if header is valid + header = next(csv_reader) + patterns = ["antibody", "antibody_id", "ensembl_peptide_id", "link"] + if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise IOError(f"Malformed or unexpected header in HPA file {file}") + + parsed_count = 0 + + # Read lines + for line in csv_reader: + if not line: + continue + + antibody_name = line[0] + antibody_id = line[1] + ensembl_id = line[2] + + self.add_to_direct_xrefs( + { + "accession": antibody_id, + "version": "1", + "label": antibody_name, + "stable_id": ensembl_id, + "ensembl_type": "translation", + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + + parsed_count += 1 + + file_io.close() + + result_message = f"{parsed_count} direct xrefs succesfully parsed" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py b/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py new file mode 100644 index 000000000..8ce883d1d --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py @@ -0,0 +1,60 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
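The HPAParser above turns each antibody row of the comma-separated HPA file into a translation-level direct xref. Below is a minimal, self-contained sketch of that row-to-dict mapping; the sample rows, the collect_hpa_xrefs helper, and the IDs used are illustrative assumptions, not code or data from this patch.

import csv
import io
from typing import Any, Dict, List

def collect_hpa_xrefs(handle, source_id: int, species_id: int) -> List[Dict[str, Any]]:
    # Mirror the HPA column layout: antibody, antibody_id, ensembl_peptide_id, link
    reader = csv.reader(handle, delimiter=",")
    next(reader)  # skip the header row
    xrefs = []
    for antibody_name, antibody_id, ensembl_id, _link in reader:
        xrefs.append({
            "accession": antibody_id,
            "version": "1",
            "label": antibody_name,
            "stable_id": ensembl_id,
            "ensembl_type": "translation",
            "source_id": source_id,
            "species_id": species_id,
            "info_type": "DIRECT",
        })
    return xrefs

# Hypothetical sample shaped like the expected HPA file
sample = io.StringIO(
    "Antibody,antibody_id,ensembl_peptide_id,link\n"
    "CAB000001,1,ENSP00000363822,http://www.proteinatlas.org/ENSG00000000003\n"
)
for xref in collect_hpa_xrefs(sample, source_id=100, species_id=9606):
    print(xref)  # each dict would be handed to add_to_direct_xrefs(xref, xref_dbi)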
+ +"""Parser module for JGI source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + +from Bio import SeqIO + + +class JGI_ProteinParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + xrefs = [] + + file_io = self.get_filehandle(file) + fasta_sequences = SeqIO.parse(file_io, "fasta") + + for fasta in fasta_sequences: + accession = fasta.id + sequence = fasta.seq + + # Extract accession value + accession = re.search(r"^ci0100(\w+?)$", accession).group(1) + + # Build an xref object and store it + xref = { + "ACCESSION": accession, + "SEQUENCE": sequence, + "SOURCE_ID": source_id, + "SPECIES_ID": species_id, + "SEQUENCE_TYPE": "peptide", + } + xrefs.append(xref) + + file_io.close() + + self.upload_xref_object_graphs(xrefs, xref_dbi) + + result_message = "%d JGI_ xrefs succesfully parsed" % len(xrefs) + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/MGIParser.py b/src/python/ensembl/production/xrefs/parsers/MGIParser.py new file mode 100644 index 000000000..2508d516a --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/MGIParser.py @@ -0,0 +1,72 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
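The JGI_ProteinParser above strips the ci0100 prefix from each FASTA identifier and queues one peptide sequence xref per record for upload_xref_object_graphs. A rough, self-contained sketch of that accession and sequence extraction follows; the FASTA snippet and the jgi_fasta_to_xrefs helper are hypothetical, while Bio.SeqIO is the same Biopython reader the parser itself uses.

import io
import re
from Bio import SeqIO

def jgi_fasta_to_xrefs(handle, source_id: int, species_id: int):
    xrefs = []
    for record in SeqIO.parse(handle, "fasta"):
        match = re.search(r"^ci0100(\w+?)$", record.id)
        if not match:
            continue  # skip identifiers that do not follow the ci0100 convention
        xrefs.append({
            "ACCESSION": match.group(1),
            "SEQUENCE": str(record.seq),
            "SOURCE_ID": source_id,
            "SPECIES_ID": species_id,
            "SEQUENCE_TYPE": "peptide",
        })
    return xrefs

# Hypothetical two-record FASTA input
fasta = io.StringIO(">ci0100130001\nMSTAVLENPGLGRKLSD\n>ci0100130002\nMAHIDQRSTLK\n")
print(jgi_fasta_to_xrefs(fasta, source_id=120, species_id=7719))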
+ +"""Parser module for MGI source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class MGIParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + syn_hash = self.get_ext_synonyms("MGI", xref_dbi) + + file_io = self.get_filehandle(file) + csv_reader = csv.reader(file_io, delimiter="\t", strict=True) + + count = 0 + syn_count = 0 + + # Read lines + for line in csv_reader: + if not line: + continue + + accession = line[0] + ensembl_id = line[5] + + xref_id = self.add_xref( + { + "accession": accession, + "version": 0, + "label": line[1], + "description": line[2], + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + self.add_direct_xref(xref_id, ensembl_id, "Gene", "", xref_dbi) + + if syn_hash.get(accession): + for synonym in syn_hash[accession]: + self.add_synonym(xref_id, synonym, xref_dbi) + syn_count += 1 + + count += 1 + + file_io.close() + + result_message = f"{count} direct MGI xrefs added\n" + result_message += f"{syn_count} synonyms added" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py b/src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py new file mode 100644 index 000000000..ae1fbb3dd --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py @@ -0,0 +1,107 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
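The MGIParser above reads the tab-separated MGI report (accession in column 1, symbol in column 2, name in column 3, Ensembl gene ID in column 6), adds one direct gene xref per row, and reuses any synonyms already stored for MGI. A small sketch of that shaping step is below; the sample row and the stand-in synonym cache are invented for illustration.

import csv
import io

# Stand-in for get_ext_synonyms("MGI", dbi)
syn_lookup = {"MGI:87853": ["Alpha-1-B", "A1b"]}

rows = io.StringIO("MGI:87853\ta1b\talpha-1-B glycoprotein\t15\t8\tENSMUSG00000022347\n")
for line in csv.reader(rows, delimiter="\t"):
    accession, symbol, description, _chrom, _pos, ensembl_id = line[:6]
    xref = {
        "accession": accession,
        "version": 0,
        "label": symbol,
        "description": description,
        "info_type": "DIRECT",
    }
    synonyms = syn_lookup.get(accession, [])
    # The real parser calls add_xref(xref, dbi), add_direct_xref(xref_id, ensembl_id,
    # "Gene", "", dbi) and add_synonym(...) for each synonym at this point
    print(xref, "->", ensembl_id, "synonyms:", synonyms)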
+ +"""Parser module for MGI CCDS source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class MGI_CCDS_Parser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + source_ids = [] + labels = {} + versions = {} + descriptions = {} + accessions = {} + + query = select(SourceUORM.source_id).filter(SourceUORM.name.like("MGI")) + result = xref_dbi.execute(query).fetchall() + for row in result: + source_ids.append(row[0]) + + query = select( + XrefUORM.accession, XrefUORM.label, XrefUORM.version, XrefUORM.description + ).filter(XrefUORM.source_id.in_(source_ids)) + + for row in xref_dbi.execute(query).mappings().all(): + if row["description"]: + accessions[row["label"]] = row.accession + labels[row["accession"]] = row.label + versions[row["accession"]] = row.version + descriptions[row["accession"]] = row.description + + # Get master xref ids via the ccds label + ccds_label_to_xref_id = {} + query = select(XrefUORM.label, XrefUORM.xref_id).where( + XrefUORM.source_id == SourceUORM.source_id, SourceUORM.name == "CCDS" + ) + result = xref_dbi.execute(query).fetchall() + for row in result: + ccds_label_to_xref_id[row[0]] = row[1] + + count = 0 + ccds_missing = 0 + mgi_missing = 0 + + mgi_io = self.get_filehandle(file) + for line in mgi_io: + line = line.rstrip() + if not line: + continue + + fields = line.split("\t") + chromosome = fields[0] + g_accession = fields[1] + gene_name = fields[2] + entrez_id = fields[3] + ccds = fields[4] + + if ccds_label_to_xref_id.get(ccds): + if accessions.get(gene_name) and labels.get(accessions[gene_name]): + accession = accessions[gene_name] + self.add_dependent_xref( + { + "master_xref_id": ccds_label_to_xref_id[ccds], + "accession": accession, + "version": versions[accession], + "label": labels[accession], + "description": descriptions[accession], + "source_id": source_id, + "species_id": species_id, + }, + xref_dbi, + ) + + count += 1 + else: + mgi_missing += 1 + else: + ccds_missing += 1 + + mgi_io.close() + + result_message = f"Added {count} MGI xrefs via CCDS\n" + result_message += ( + f"{ccds_missing} CCDS not resolved, {mgi_missing} MGI not found" + ) + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py b/src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py new file mode 100644 index 000000000..010298200 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py @@ -0,0 +1,101 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
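The MGI_CCDS_Parser above resolves each file row twice: the CCDS label must already exist as a CCDS xref (the master), and the gene symbol must map back to a known MGI accession before a dependent xref is added. A toy model of that double lookup follows; the cache contents, IDs, and the resolve helper are invented for illustration.

# Invented caches standing in for the queries against the xref database
ccds_label_to_xref_id = {"CCDS18166": 501}          # CCDS label -> master xref_id
mgi_accession_for_symbol = {"Pax6": "MGI:97490"}    # gene symbol -> MGI accession
mgi_label = {"MGI:97490": "Pax6"}
mgi_description = {"MGI:97490": "paired box 6"}

def resolve(row):
    _chrom, _g_acc, gene_name, _entrez, ccds = row
    master_xref_id = ccds_label_to_xref_id.get(ccds)
    accession = mgi_accession_for_symbol.get(gene_name)
    if master_xref_id is None:
        return "ccds_missing"
    if accession is None or accession not in mgi_label:
        return "mgi_missing"
    # add_dependent_xref would be called here with the master xref and MGI details
    return {
        "master_xref_id": master_xref_id,
        "accession": accession,
        "label": mgi_label[accession],
        "description": mgi_description[accession],
    }

print(resolve(["2", "NC_000068.8", "Pax6", "18508", "CCDS18166"]))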
+ +"""Parser module for MGI Descriptions.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + +EXPECTED_NUMBER_OF_COLUMNS = 12 + + +class MGI_Desc_Parser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + file_io = self.get_filehandle(file) + csv_reader = csv.reader( + file_io, delimiter="\t", strict=True, quotechar=None, escapechar=None + ) + + # Check if header is valid + header = next(csv_reader) + patterns = [ + "mgi accession id", + "chr", + "cm position", + "genome coordinate start", + "genome coordinate end", + "strand", + "marker symbol", + "status", + "marker name", + "marker type", + "feature type", + r"marker\ssynonyms\s\(pipe\-separated\)", + ] + if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise IOError(f"Malformed or unexpected header in MGI_desc file {file}") + + xref_count = 0 + syn_count = 0 + acc_to_xref = {} + + # Read lines + for line in csv_reader: + if not line: + continue + + accession = line[0] + marker = line[8] + + xref_id = self.add_xref( + { + "accession": accession, + "label": line[6], + "description": marker, + "source_id": source_id, + "species_id": species_id, + "info_type": "MISC", + }, + xref_dbi, + ) + acc_to_xref[accession] = xref_id + + if not marker and verbose: + logging.info(f"{accession} has no description") + + xref_count += 1 + + if acc_to_xref.get(accession): + synonym_field = line[11] + if synonym_field: + synonyms = re.split(r"[|]", synonym_field) + + for synonym in synonyms: + self.add_synonym(xref_id, synonym, xref_dbi) + syn_count += 1 + + file_io.close() + + result_message = f"{xref_count} MGI Description Xrefs added\n" + result_message += f"{syn_count} synonyms added" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/MIMParser.py b/src/python/ensembl/production/xrefs/parsers/MIMParser.py new file mode 100644 index 000000000..1ae4f5952 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/MIMParser.py @@ -0,0 +1,159 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
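In the MGI_Desc_Parser above, column 7 carries the marker symbol used as the label, column 9 the marker name used as the description, and column 12 a pipe-separated synonym list. A compact sketch of that per-row handling follows; the sample line is invented and the calls into the xref database are only indicated in comments.

import re

# Hypothetical 12-column MGI report line
line = [
    "MGI:1918911", "1", "", "", "", "+", "0610005C13Rik", "O",
    "RIKEN cDNA 0610005C13 gene", "Gene", "lncRNA gene", "Gm33141|Gm52279",
]

xref = {
    "accession": line[0],
    "label": line[6],
    "description": line[8],
    "info_type": "MISC",
}
synonyms = [s for s in re.split(r"[|]", line[11]) if s]
# The real parser calls add_xref(xref, dbi) and then add_synonym(xref_id, s, dbi)
# for every entry in the synonym list
print(xref, synonyms)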
+ +"""Parser module for MIM source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class MIMParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + general_source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not general_source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + old_to_new, removed = {}, {} + sources = [] + + sources.append(general_source_id) + + gene_source_id = self.get_source_id_for_source_name("MIM_GENE", xref_dbi) + sources.append(gene_source_id) + morbid_source_id = self.get_source_id_for_source_name("MIM_MORBID", xref_dbi) + sources.append(morbid_source_id) + + TYPE_SINGLE_SOURCES = { + "*": gene_source_id, + "": morbid_source_id, + "#": morbid_source_id, + "%": morbid_source_id, + } + + counters = {gene_source_id: 0, morbid_source_id: 0, "removed": 0, "synonyms": 0} + + if verbose: + logging.info("Sources are: " + ", ".join(map(str, sources))) + + for section in self.get_file_sections(file, "*RECORD*"): + if len(section) == 1: + continue + + record = "".join(section) + + # Extract the TI field + ti = self.extract_ti(record) + if not ti: + raise IOError("Failed to extract TI field from record") + + # Extract record type + (record_type, number, long_desc) = self.parse_ti(ti) + if record_type is None: + raise IOError( + "Failed to extract record type and description from TI field" + ) + + # Use the first block of text as description + fields = re.split(";;", long_desc, flags=re.MULTILINE | re.DOTALL) + label = fields[0] + label = f"{label} [{record_type}{number}]" + + xref_object = { + "accession": number, + "label": label, + "description": long_desc, + "species_id": species_id, + "info_type": "UNMAPPED", + } + + if TYPE_SINGLE_SOURCES.get(record_type): + type_source = TYPE_SINGLE_SOURCES[record_type] + xref_object["source_id"] = type_source + counters[type_source] += 1 + + xref_id = self.add_xref(xref_object, xref_dbi) + elif record_type == "+": + # This type means both gene and phenotype, add both + xref_object["source_id"] = gene_source_id + counters[gene_source_id] += 1 + xref_id = self.add_xref(xref_object, xref_dbi) + + xref_object["source_id"] = morbid_source_id + counters[morbid_source_id] += 1 + xref_id = self.add_xref(xref_object, xref_dbi) + elif record_type == "^": + match = re.search( + r"MOVED\sTO\s(\d+)", long_desc, flags=re.MULTILINE | re.DOTALL + ) + if match: + new_number = match.group(1) + if new_number != number: + old_to_new[number] = new_number + elif long_desc == "REMOVED FROM DATABASE": + removed[number] = 1 + counters["removed"] += 1 + else: + raise IOError(f"Unsupported type of a '^' record: '{long_desc}'") + + # Generate synonyms from "MOVED TO" entries + for old, new in old_to_new.items(): + # Some entries in the MIM database have been moved multiple times + # Keep traversing the chain of renames until we have reached the end + while old_to_new.get(new): + new = old_to_new[new] + + # Check if the entry has been removed from the database + if not removed.get(new): + self.add_to_syn_for_mult_sources( + new, sources, old, species_id, xref_dbi + ) + counters["synonyms"] += 1 + + result_message = "%d genemap and %d phenotype MIM xrefs added\n" % ( + counters[gene_source_id], + counters[morbid_source_id], + ) + result_message += ( + "\t%d synonyms (defined by MOVED TO) added\n" % counters["synonyms"] + ) + result_message += "\t%d 
entries removed" % counters["removed"] + + return 0, result_message + + def extract_ti(self, input_record: str) -> str: + ti = None + + match = re.search( + r"[*]FIELD[*]\sTI\n(.+?)\n?(?:[*]FIELD[*]| [*]RECORD[*]| [*]THEEND[*])", + input_record, + flags=re.MULTILINE | re.DOTALL, + ) + if match: + ti = match.group(1) + + return ti + + def parse_ti(self, ti: str) -> Tuple[Optional[str], Optional[str], Optional[str]]: + ti = re.sub(r"(?:;;\n|\n;;)", ";;", ti, flags=re.MULTILINE | re.DOTALL) + ti = re.sub(r"\n", "", ti, flags=re.MULTILINE | re.DOTALL) + + match = re.search(r"\A([#%+*^]*)(\d+)\s+(.+)", ti) + if match: + return match.group(1), match.group(2), match.group(3) + + return None, None, None diff --git a/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py b/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py new file mode 100644 index 000000000..6c7688889 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py @@ -0,0 +1,170 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for MIM to Gene source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + +EXPECTED_NUMBER_OF_COLUMNS = 6 + + +class Mim2GeneParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + general_source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not general_source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Get needed source IDs + mim_gene_source_id = self.get_source_id_for_source_name("MIM_GENE", xref_dbi) + mim_morbid_source_id = self.get_source_id_for_source_name( + "MIM_MORBID", xref_dbi + ) + entrez_source_id = self.get_source_id_for_source_name("EntrezGene", xref_dbi) + + # This will be used to prevent insertion of duplicates + self.build_dependent_mappings(mim_gene_source_id, xref_dbi) + self.build_dependent_mappings(mim_morbid_source_id, xref_dbi) + + mim_gene = self.get_valid_codes("MIM_GENE", species_id, xref_dbi) + mim_morbid = self.get_valid_codes("MIM_MORBID", species_id, xref_dbi) + entrez = self.get_valid_codes("EntrezGene", species_id, xref_dbi) + + counters = { + "all_entries": 0, + "dependent_on_entrez": 0, + "missed_master": 0, + "missed_omim": 0, + } + + file_io = self.get_filehandle(file) + csv_reader = csv.reader(file_io, delimiter="\t") + + # Read lines + for line in csv_reader: + if not line: + continue + + # Extract the header from among the comments + match = re.search(r"\A([#])?", line[0]) + if match: + is_comment = match.group(1) + if is_comment: + patterns = [ + r"\A[#]?\s*MIM[ ]number", + "GeneID", + "type", + "Source", + "MedGenCUI", + "Comment", + ] + if len( + line + ) == EXPECTED_NUMBER_OF_COLUMNS and not self.is_file_header_valid( + EXPECTED_NUMBER_OF_COLUMNS, patterns, line, True + ): + raise 
IOError( + f"Malformed or unexpected header in Mim2Gene file {file}" + ) + continue + + if len(line) != EXPECTED_NUMBER_OF_COLUMNS: + raise IOError( + f"Line {csv_reader.line_num} of input file {file} has an incorrect number of columns" + ) + + fields = [re.sub(r"\s+\Z", "", x) for x in line] + omim_acc = fields[0] + entrez_id = fields[1] + type = fields[2] + source = fields[3] + medgen = fields[4] + comment = fields[5] + + counters["all_entries"] += 1 + + # No point in doing anything if we have no matching MIM xref ... + if omim_acc not in mim_gene and omim_acc not in mim_morbid: + counters["missed_omim"] += 1 + continue + + # ...or no EntrezGene xref to match it to + if not entrez_id or entrez_id not in entrez: + counters["missed_master"] += 1 + continue + + # Check if type is known + if verbose and type not in [ + "gene", + "gene/phenotype", + "predominantly phenotypes", + "phenotype", + ]: + logging.warn( + f"Unknown type {type} for MIM Number {omim_acc} ({file}:{csv_reader.line_num})" + ) + + # With all the checks taken care of, insert the mappings. We check + # both MIM_GENE and MIM_MORBID every time because some MIM entries + # can appear in both. + if omim_acc in mim_gene: + for mim_xref_id in mim_gene[omim_acc]: + counters["dependent_on_entrez"] += self.process_xref_entry( + { + "mim_xref_id": mim_xref_id, + "mim_source_id": mim_gene_source_id, + "entrez_xrefs": entrez[entrez_id], + "entrez_source_id": entrez_source_id, + }, + xref_dbi, + ) + if omim_acc in mim_morbid: + for mim_xref_id in mim_morbid[omim_acc]: + counters["dependent_on_entrez"] += self.process_xref_entry( + { + "mim_xref_id": mim_xref_id, + "mim_source_id": mim_morbid_source_id, + "entrez_xrefs": entrez[entrez_id], + "entrez_source_id": entrez_source_id, + }, + xref_dbi, + ) + + file_io.close() + + result_message = ( + "Processed %d entries. Out of those\n" % counters["all_entries"] + ) + result_message += "\t%d had missing OMIM entries,\n" % counters["missed_omim"] + result_message += ( + "\t%d were dependent EntrezGene xrefs,\n" % counters["dependent_on_entrez"] + ) + result_message += "\t%d had missing master entries." % counters["missed_master"] + + return 0, result_message + + def process_xref_entry(self, args: Dict[str, Any], dbi: Connection) -> int: + count = 0 + + for ent_id in args["entrez_xrefs"]: + self.add_dependent_xref_maponly( + args["mim_xref_id"], args["mim_source_id"], ent_id, None, dbi, True + ) + count += 1 + + return count diff --git a/src/python/ensembl/production/xrefs/parsers/RFAMParser.py b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py new file mode 100644 index 000000000..c7d4990eb --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py @@ -0,0 +1,193 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
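A note on the TI-field handling in MIMParser above: the record prefix decides whether an entry becomes a MIM_GENE xref ('*'), a MIM_MORBID xref ('#', '%', or no prefix), both ('+'), or a move/removal ('^'), and "MOVED TO" entries may chain through several renames. A minimal standalone sketch (sample TI text and MIM numbers are invented, not from the OMIM dump):

import re

def parse_ti(ti: str):
    # Same normalisation as MIMParser.parse_ti: collapse ';;' continuations and newlines
    ti = re.sub(r"(?:;;\n|\n;;)", ";;", ti)
    ti = re.sub(r"\n", "", ti)
    match = re.search(r"\A([#%+*^]*)(\d+)\s+(.+)", ti)
    return match.groups() if match else (None, None, None)

print(parse_ti("*601739 GENE XYZ; XYZ"))  # ('*', '601739', 'GENE XYZ; XYZ')

# '^' records that were "MOVED TO" another number can chain; the parser walks
# the chain to its end before adding the old number as a synonym there.
old_to_new = {"100100": "100200", "100200": "100300"}
new = old_to_new["100100"]
while new in old_to_new:
    new = old_to_new[new]
print(new)  # 100300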
+ +"""Parser module for RFAM source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class RFAMParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + species_name = args["species_name"] + file = args["file"] + dba = args["dba"] + ensembl_release = args["ensembl_release"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Extract db connection parameters from file + wget_url, db_user, db_host, db_port, db_name, db_pass = ( + self.extract_params_from_string( + file, ["wget", "user", "host", "port", "dbname", "pass"] + ) + ) + if not db_user: + db_user = "ensro" + if not db_port: + db_port = "3306" + + # Get the species name(s) + species_id_to_names = self.species_id_to_names(xref_dbi) + if species_name: + species_id_to_names.setdefault(species_id, []).append(species_name) + + if not species_id_to_names.get(species_id): + return 0, "Skipped. Could not find species ID to name mapping" + + species_name = species_id_to_names[species_id][0] + + # Connect to the appropriate rfam db + if db_host: + rfam_db_url = URL.create( + "mysql", db_user, db_pass, db_host, db_port, db_name + ) + elif dba: + rfam_db_url = dba + else: + if verbose: + logging.info("Looking for db in mysql-ens-sta-1") + registry = "ensro@mysql-ens-sta-1:4519" + rfam_db_url = self.get_db_from_registry( + species_name, "core", ensembl_release, registry + ) + + if not rfam_db_url: + raise IOError(f"Could not find RFAM DB.") + else: + if verbose: + logging.info(f"Found RFAM DB: {rfam_db_url}") + + # Get data from rfam db + db_engine = self.get_db_engine(rfam_db_url) + with db_engine.connect() as rfam_dbi: + query = ( + select( + TranscriptORM.stable_id.distinct(), + DnaAlignFeatureORM.hit_name, + AnalysisORM.analysis_id, + ) + .join( + TranscriptORM, + and_( + TranscriptORM.analysis_id == AnalysisORM.analysis_id, + AnalysisORM.logic_name.like("ncrna%"), + TranscriptORM.biotype != "miRNA", + ), + ) + .join( + ExonTranscriptORM, + ExonTranscriptORM.transcript_id == TranscriptORM.transcript_id, + ) + .join( + SupportingFeatureORM, + and_( + SupportingFeatureORM.exon_id == ExonTranscriptORM.exon_id, + SupportingFeatureORM.feature_type == "dna_align_feature", + ), + ) + .join( + DnaAlignFeatureORM, + DnaAlignFeatureORM.dna_align_feature_id + == SupportingFeatureORM.feature_id, + ) + .order_by(DnaAlignFeatureORM.hit_name) + ) + result = rfam_dbi.execute(query).mappings().all() + + # Create a dict with RFAM accessions as keys and value is an array of ensembl transcript stable_ids + rfam_transcript_stable_ids = {} + for row in result: + rfam_id = None + + match = re.search(r"^(RF\d+)", row.hit_name) + if match: + rfam_id = match.group(1) + + if rfam_id: + rfam_transcript_stable_ids.setdefault(rfam_id, []).append(row.stable_id) + + # Download file through wget if url present + if wget_url: + uri = urlparse(wget_url) + file = os.path.join(os.path.dirname(file), os.path.basename(uri.path)) + wget.download(wget_url, file) + + # Read data from file + lines = [] + entry = "" + + file_io = gzip.open(file, "r") + for line in file_io: + line = line.decode("latin-1") + if re.search(r"^//", line): + lines.append(entry) + entry = "" + elif ( + re.search(r"^#=GF\sAC", line) + or re.search(r"^#=GF\sID", line) + or re.search(r"^#=GF\sDE", line) + ): + entry += line + file_io.close() + + # Add 
xrefs + xref_count, direct_count = 0, 0 + + for entry in lines: + accession, label, description = None, None, None + + # Extract data from entry + match = re.search(r"^#=GF\sAC\s+(\w+)", entry, flags=re.MULTILINE) + if match: + accession = match.group(1) + match = re.search(r"^#=GF\sID\s+([^\n]+)", entry, flags=re.MULTILINE) + if match: + label = match.group(1) + match = re.search(r"^#=GF\sDE\s+([^\n]+)", entry, flags=re.MULTILINE) + if match: + description = match.group(1) + + if accession: + if rfam_transcript_stable_ids.get(accession): + xref_id = self.add_xref( + { + "accession": accession, + "version": 0, + "label": label or accession, + "description": description, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + xref_count += 1 + + transcript_stable_ids = rfam_transcript_stable_ids[accession] + for stable_id in transcript_stable_ids: + self.add_direct_xref( + xref_id, stable_id, "Transcript", "", xref_dbi + ) + direct_count += 1 + + result_message = ( + f"Added {xref_count} RFAM xrefs and {direct_count} direct xrefs" + ) + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/RGDParser.py b/src/python/ensembl/production/xrefs/parsers/RGDParser.py new file mode 100644 index 000000000..11ddd0e0e --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/RGDParser.py @@ -0,0 +1,154 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for RGD source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class RGDParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + direct_source_id = self.get_source_id_for_source_name( + "RGD", xref_dbi, "direct_xref" + ) + + # Used to assign dbIDs for when RGD Xrefs are dependent on RefSeq xrefs + preloaded_refseq = self.get_valid_codes("refseq", species_id, xref_dbi) + + rgd_io = self.get_filehandle(file) + csv_reader = csv.DictReader( + filter(lambda row: row[0] != "#", rgd_io), delimiter="\t" + ) + + header_found, count, ensembl_count, mismatch, syn_count = 0, 0, 0, 0, 0 + columns = {} + + # Read lines + for line in csv_reader: + # Don't bother doing anything if we don't have an RGD ID + if not line.get("GENE_RGD_ID") or not line["GENE_RGD_ID"]: + continue + + # Some RGD annotation is directly copied from Ensembl + if re.search("ENSRNO", line["SYMBOL"]): + continue + + genbank_nucleotides = [] + if line.get("GENBANK_NUCLEOTIDE"): + genbank_nucleotides = line["GENBANK_NUCLEOTIDE"].split(";") + + done = 0 + # The nucleotides are sorted in the file in alphabetical order. 
Filter them down + # to a higher quality subset, then add dependent Xrefs where possible + for nucleotide in self.sort_refseq_accessions(genbank_nucleotides): + if not done and preloaded_refseq.get(nucleotide): + for xref in preloaded_refseq[nucleotide]: + xref_id = self.add_dependent_xref( + { + "master_xref_id": xref, + "accession": line["GENE_RGD_ID"], + "label": line["SYMBOL"], + "description": line["NAME"], + "source_id": source_id, + "species_id": species_id, + }, + xref_dbi, + ) + + count += 1 + syn_count += self.process_synonyms( + xref_id, line["OLD_SYMBOL"], xref_dbi + ) + done = 1 + + # Add direct xrefs + if line.get("ENSEMBL_ID"): + ensembl_ids = line["ENSEMBL_ID"].split(";") + + for id in ensembl_ids: + self.add_to_direct_xrefs( + { + "stable_id": id, + "ensembl_type": "gene", + "accession": line["GENE_RGD_ID"], + "label": line["SYMBOL"], + "description": line["NAME"], + "source_id": direct_source_id, + "species_id": species_id, + }, + xref_dbi, + ) + xref_id = self.get_xref_id( + line["GENE_RGD_ID"], direct_source_id, species_id, xref_dbi + ) + + ensembl_count += 1 + syn_count += self.process_synonyms( + xref_id, line["OLD_SYMBOL"], xref_dbi + ) + done = 1 + + # If neither direct or dependent, add misc xref + if not done: + xref_id = self.add_xref( + { + "accession": line["GENE_RGD_ID"], + "label": line["SYMBOL"], + "description": line["NAME"], + "source_id": source_id, + "species_id": species_id, + "info_type": "MISC", + }, + xref_dbi, + ) + mismatch += 1 + + rgd_io.close() + + result_message = f"{count} xrefs succesfully loaded and dependent on refseq\n" + result_message += f"\t{mismatch} xrefs added but with NO dependencies\n" + result_message += f"\t{ensembl_count} direct xrefs successfully loaded\n" + result_message += f"\tAdded {syn_count} synonyms, including duplicates" + + return 0, result_message + + def sort_refseq_accessions(self, accessions: List[str]) -> List[str]: + refseq_priorities = {"NM": 1, "NP": 1, "NR": 1, "XM": 2, "XP": 2, "XR": 2} + + accessions = sorted( + [x for x in accessions if x[:2] in refseq_priorities], + key=lambda x: (refseq_priorities[x[:2]], x), + ) + return accessions + + def process_synonyms(self, xref_id: int, synonym_string: str, dbi: Connection) -> int: + syn_count = 0 + + if not synonym_string or not xref_id: + return syn_count + + synonyms = synonym_string.split(";") + for synonym in synonyms: + self.add_synonym(xref_id, synonym, dbi) + syn_count += 1 + + return syn_count diff --git a/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py b/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py new file mode 100644 index 000000000..4ae9b46d8 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py @@ -0,0 +1,189 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
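The RefSeq prioritisation used by RGDParser.sort_refseq_accessions above can be shown in isolation: curated NM_/NP_/NR_ accessions sort ahead of predicted XM_/XP_/XR_ ones, and anything outside those prefixes is dropped. A small sketch with invented accessions:

refseq_priorities = {"NM": 1, "NP": 1, "NR": 1, "XM": 2, "XP": 2, "XR": 2}

accessions = ["XM_039112", "NM_012345", "AC_000001", "NR_003287"]
kept = sorted(
    (a for a in accessions if a[:2] in refseq_priorities),  # drop non-RefSeq prefixes
    key=lambda a: (refseq_priorities[a[:2]], a),             # curated first, then alphabetical
)
print(kept)  # ['NM_012345', 'NR_003287', 'XM_039112']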
+ +"""Parser module for Reactome source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class ReactomeParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + species_name = args["species_name"] + file = args["file"] + release_file = args["rel_file"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Parse release file + if release_file: + release = None + + release_io = self.get_filehandle(release_file) + for line in release_io: + match = re.search(r"([0-9]*)", line) + if match: + release = match.group(1) + if verbose: + logging.info(f"Reactome release is '{release}'") + release_io.close() + + if not release: + raise IOError(f"Could not find release using {release_file}") + + self.set_release(source_id, release, xref_dbi) + + # Create a hash of all valid names for this species + species_to_alias = self.species_id_to_names(xref_dbi) + if species_name: + species_to_alias.setdefault(species_id, []).append(species_name) + + if not species_to_alias.get(species_id): + return 0, "Skipped. Could not find species ID to name mapping" + + aliases = species_to_alias[species_id] + alias_to_species_id = {alias: 1 for alias in aliases} + + # Get relevant source ids + reactome_source_id = self.get_source_id_for_source_name( + "reactome", xref_dbi, "direct" + ) + transcript_reactome_source_id = self.get_source_id_for_source_name( + "reactome_transcript", xref_dbi + ) + gene_reactome_source_id = self.get_source_id_for_source_name( + "reactome_gene", xref_dbi + ) + reactome_uniprot_source_id = self.get_source_id_for_source_name( + "reactome", xref_dbi, "uniprot" + ) + + # Cannot continue unless source ids are found + if ( + not reactome_source_id + or not transcript_reactome_source_id + or not gene_reactome_source_id + ): + raise KeyError("Could not find source id for reactome sources") + else: + if verbose: + logging.info(f"Source_id = {reactome_source_id}") + logging.info(f"Transcript_source_id = {transcript_reactome_source_id}") + logging.info(f"Gene_source_id = {gene_reactome_source_id}") + + if not reactome_uniprot_source_id: + raise KeyError("Could not find source id for reactome uniprot") + else: + if verbose: + logging.info(f"Uniprot_source_id = {reactome_uniprot_source_id}") + + # Get uniprot accessions + is_uniprot = 0 + uniprot_accessions = {} + if re.search("UniProt", file): + is_uniprot = 1 + uniprot_accessions = self.get_valid_codes("uniprot/", species_id, xref_dbi) + + parsed_count, err_count = 0, 0 + + # Read file + reactome_io = self.get_filehandle(file) + + for line in reactome_io: + line = line.strip() + + (ensembl_stable_id, reactome_id, url, description, evidence, species) = ( + re.split(r"\t+", line) + ) + + # Check description pattern + match = re.search( + r"^[A-Za-z0-9_,\(\)\/\-\.:\+'&;\"\/\?%>\s\[\]]+$", description + ) + if not match: + continue + + species = re.sub(r"\s", "_", species) + species = species.lower() + + current_source_id = reactome_source_id + + if alias_to_species_id.get(species): + parsed_count += 1 + + ensembl_type = None + info_type = "DIRECT" + + # Add uniprot dependent xrefs + if is_uniprot: + if uniprot_accessions.get(ensembl_stable_id): + for xref in uniprot_accessions[ensembl_stable_id]: + xref_id = self.add_dependent_xref( + { + "master_xref_id": xref, + "accession": reactome_id, + "label": 
reactome_id, + "description": description, + "source_id": reactome_uniprot_source_id, + "species_id": species_id, + }, + xref_dbi, + ) + info_type = "DEPENDENT" + + # Attempt to guess the object_type based on the stable id + elif re.search(r"G[0-9]*$", ensembl_stable_id): + ensembl_type = "gene" + current_source_id = gene_reactome_source_id + elif re.search(r"T[0-9]*$", ensembl_stable_id): + ensembl_type = "transcript" + current_source_id = transcript_reactome_source_id + elif re.search(r"P[0-9]*$", ensembl_stable_id): + ensembl_type = "translation" + + # Is not in Uniprot and does not match Ensembl stable id format + else: + if verbose: + logging.debug(f"Could not find type for {ensembl_stable_id}") + err_count += 1 + continue + + # Add new entry for reactome xref as well as direct xref to ensembl stable id + xref_id = self.add_xref( + { + "accession": reactome_id, + "label": reactome_id, + "description": description, + "source_id": current_source_id, + "species_id": species_id, + "info_type": info_type, + }, + xref_dbi, + ) + + if ensembl_type: + self.add_direct_xref( + xref_id, ensembl_stable_id, ensembl_type, "", xref_dbi + ) + + reactome_io.close() + + result_message = f"{parsed_count} entries processed\n" + result_message += f"{err_count} not found" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py new file mode 100644 index 000000000..14f6f76dd --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py @@ -0,0 +1,96 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
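ReactomeParser above guesses the Ensembl object type from the stable id suffix when the mapping file is not the UniProt one. A standalone sketch of that branch (the example stable ids are illustrative human ids, not taken from a Reactome file):

import re

def guess_ensembl_type(stable_id: str):
    # Mirrors the suffix checks in ReactomeParser.run: gene ids end in G<digits>,
    # transcripts in T<digits>, translations in P<digits>; anything else is skipped.
    if re.search(r"G[0-9]*$", stable_id):
        return "gene"
    if re.search(r"T[0-9]*$", stable_id):
        return "transcript"
    if re.search(r"P[0-9]*$", stable_id):
        return "translation"
    return None

print(guess_ensembl_type("ENSG00000139618"))  # gene
print(guess_ensembl_type("ENST00000380152"))  # transcript
print(guess_ensembl_type("ENSP00000369497"))  # translation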
+ +"""Parser module for RefSeq coordinate xrefs.""" + +from ensembl.production.xrefs.parsers.BaseParser import * +from ensembl.common.RangeRegistry import RangeRegistry + + +class RefSeqCoordinateParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + species_name = args["species_name"] + file = args["file"] + dba = args["dba"] + ensembl_release = args["ensembl_release"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + source_ids = { + "peptide": self.get_source_id_for_source_name( + "RefSeq_peptide", xref_dbi, "otherfeatures" + ), + "mrna": self.get_source_id_for_source_name( + "RefSeq_mRNA", xref_dbi, "otherfeatures" + ), + "ncrna": self.get_source_id_for_source_name( + "RefSeq_ncRNA", xref_dbi, "otherfeatures" + ), + "peptide_predicted": self.get_source_id_for_source_name( + "RefSeq_peptide_predicted", xref_dbi, "otherfeatures" + ), + "mrna_predicted": self.get_source_id_for_source_name( + "RefSeq_mRNA_predicted", xref_dbi, "otherfeatures" + ), + "ncrna_predicted": self.get_source_id_for_source_name( + "RefSeq_ncRNA_predicted", xref_dbi, "otherfeatures" + ), + "entrezgene": self.get_source_id_for_source_name("EntrezGene", xref_dbi), + "wikigene": self.get_source_id_for_source_name("WikiGene", xref_dbi), + } + + if verbose: + logging.info(f'RefSeq_peptide source ID = {source_ids["peptide"]}') + logging.info(f'RefSeq_mRNA source ID = {source_ids["mrna"]}') + logging.info(f'RefSeq_ncRNA source ID = {source_ids["ncrna"]}') + logging.info( + f'RefSeq_peptide_predicted source ID = {source_ids["peptide_predicted"]}' + ) + logging.info( + f'RefSeq_mRNA_predicted source ID = {source_ids["mrna_predicted"]}' + ) + logging.info( + f'RefSeq_ncRNA_predicted source ID = {source_ids["ncrna_predicted"]}' + ) + + # Get the species name(s) + species_id_to_names = self.species_id_to_names(xref_dbi) + if species_name: + species_id_to_names.setdefault(species_id, []).append(species_name) + + if not species_id_to_names.get(species_id): + return 0, "Skipped. Could not find species ID to name mapping." + species_name = species_id_to_names[species_id][0] + + # Connect to the appropriate dbs + if dba: + scripts_dir = args["perl_scripts_dir"] + xref_db_url = args["xref_db_url"] + source_ids_json = json.dumps(source_ids) + + logging.info( + f"Running perl script {scripts_dir}/refseq_coordinate_parser.pl" + ) + perl_cmd = f"perl {scripts_dir}/refseq_coordinate_parser.pl --xref_db_url '{xref_db_url}' --core_db_url '{args['core_db_url']}' --otherf_db_url '{dba}' --source_ids '{source_ids_json}' --species_id {species_id} --species_name {species_name} --release {ensembl_release}" + cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE) + + return 0, "Added refseq_import xrefs." + else: + # Not all species have an otherfeatures database, skip if not found + return 0, f"Skipped. No otherfeatures database for '{species_name}'." diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py b/src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py new file mode 100644 index 000000000..93d773270 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py @@ -0,0 +1,341 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for RefSeq sources (dna and peptide).""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class RefSeqGPFFParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + species_name = args["species_name"] + file = args["file"] + release_file = args["rel_file"] + xref_dbi = args["xref_dbi"] + verbose = args.get("verbose", False) + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Get needed source ids + source_ids = { + "peptide_source_id": self.get_source_id_for_source_name( + "RefSeq_peptide", xref_dbi + ), + "mrna_source_id": self.get_source_id_for_source_name( + "RefSeq_mRNA", xref_dbi, "refseq" + ), + "ncrna_source_id": self.get_source_id_for_source_name( + "RefSeq_ncRNA", xref_dbi + ), + "pred_peptide_source_id": self.get_source_id_for_source_name( + "RefSeq_peptide_predicted", xref_dbi + ), + "pred_mrna_source_id": self.get_source_id_for_source_name( + "RefSeq_mRNA_predicted", xref_dbi, "refseq" + ), + "pred_ncrna_source_id": self.get_source_id_for_source_name( + "RefSeq_ncRNA_predicted", xref_dbi + ), + "entrez_source_id": self.get_source_id_for_source_name( + "EntrezGene", xref_dbi + ), + "wiki_source_id": self.get_source_id_for_source_name("WikiGene", xref_dbi), + } + + if verbose: + logging.info( + f'RefSeq_peptide source ID = {source_ids["peptide_source_id"]}' + ) + logging.info(f'RefSeq_mRNA source ID = {source_ids["mrna_source_id"]}') + logging.info(f'RefSeq_ncRNA source ID = {source_ids["ncrna_source_id"]}') + logging.info( + f'RefSeq_peptide_predicted source ID = {source_ids["pred_peptide_source_id"]}' + ) + logging.info( + f'RefSeq_mRNA_predicted source ID = {source_ids["pred_mrna_source_id"]}' + ) + logging.info( + f'RefSeq_ncRNA_predicted source ID = {source_ids["pred_ncrna_source_id"]}' + ) + logging.info(f'EntrezGene source ID = {source_ids["entrez_source_id"]}') + logging.info(f'WikiGene source ID = {source_ids["wiki_source_id"]}') + + # Extract version from release file + if release_file: + # Parse and set release info + index = 0 + for section in self.get_file_sections(release_file, "***"): + index += 1 + if index == 2: + release = "".join(section) + release = re.sub(r"\s{2,}", " ", release) + release = release.strip() + release = re.sub( + r".*(NCBI Reference Sequence.*) Distribution.*", r"\1", release + ) + release = re.sub(r"Release (\d+)", r"Release \1,", release) + break + + # Set releases + self.set_release(source_ids["peptide_source_id"], release, xref_dbi) + self.set_release(source_ids["mrna_source_id"], release, xref_dbi) + self.set_release(source_ids["ncrna_source_id"], release, xref_dbi) + self.set_release(source_ids["pred_mrna_source_id"], release, xref_dbi) + self.set_release(source_ids["pred_ncrna_source_id"], release, xref_dbi) + self.set_release(source_ids["pred_peptide_source_id"], release, xref_dbi) + + 
result_message = self.create_xrefs( + source_ids, species_id, species_name, file, xref_dbi + ) + + return 0, result_message + + def create_xrefs(self, source_ids: Dict[str, int], species_id: int, species_name: str, file: str, dbi: Connection) -> str: + counts = { + "num_mrna": 0, + "num_ncrna": 0, + "num_pred_mrna": 0, + "num_pred_ncrna": 0, + "num_peptide": 0, + "num_pred_peptide": 0, + "num_entrez": 0, + "num_wiki": 0, + } + + # Create a dict of all valid names for this species + species_id_to_names = self.species_id_to_names(dbi) + if species_name: + species_id_to_names.setdefault(species_id, []).append(species_name) + if not species_id_to_names.get(species_id): + return "Skipped. Could not find species ID to name mapping" + names = species_id_to_names[species_id] + name_to_species_id = {name: species_id for name in names} + + # Create a dict of all valid taxon_ids for this species + species_id_to_tax = self.species_id_to_taxonomy(dbi) + species_id_to_tax.setdefault(species_id, []).append(species_id) + tax_ids = species_id_to_tax[species_id] + tax_to_species_id = {tax_id: species_id for tax_id in tax_ids} + + # Retrieve existing RefSeq mRNA, EntrezGene, and WikiGene xrefs + entrez_acc_to_label = self.get_acc_to_label("EntrezGene", species_id, dbi) + refseq_ids = self.get_valid_codes("RefSeq_mRNA", species_id, dbi) + refseq_ids.update( + self.get_valid_codes("RefSeq_mRNA_predicted", species_id, dbi) + ) + entrez_ids = self.get_valid_codes("EntrezGene", species_id, dbi) + wiki_ids = self.get_valid_codes("WikiGene", species_id, dbi) + + # Get file type + file_type = self.type_from_file(os.path.basename(file)) + if not file_type: + return f"Could not work out sequence type for {file}" + + xrefs = [] + + # Read file + for section in self.get_file_sections(file, "//\n"): + if len(section) == 1: + continue + + entry = "".join(section) + xref = {} + + # Extract the species name + species_id_check = None + match = re.search(r"\s+ORGANISM\s+(.*)\n", entry) + if match: + species = match.group(1).lower() + species = re.sub(r"^\s*", "", species) + species = re.sub(r"\s*\(.+\)", "", species) + species = re.sub(r"\s+", "_", species) + species = re.sub(r"\n", "", species) + + species_id_check = name_to_species_id[species] + + # Try going through the taxon ID if species check didn't work + if not species_id_check: + match = re.search(r"db_xref=\"taxon:(\d+)\"", entry) + if match: + taxon_id = match.group(1) + species_id_check = tax_to_species_id[taxon_id] + + # Skip xrefs for species that aren't in the species table + if not species_id_check or species_id != species_id_check: + continue + + # Extract accession and version + accession = re.search( + r"^ACCESSION\s+(\S+)", entry, flags=re.MULTILINE + ).group(1) + version = re.search(r"^VERSION\s+(\S+)", entry, flags=re.MULTILINE).group(1) + + # Get the right source ID based on file type and whether this is predicted (X*) or not + source_id = 0 + if file_type == "dna": + if re.search(r"^XM_", accession): + source_id = source_ids["pred_mrna_source_id"] + counts["num_pred_mrna"] += 1 + elif re.search(r"^XR", accession): + source_id = source_ids["pred_ncrna_source_id"] + counts["num_pred_ncrna"] += 1 + elif re.search(r"^NM", accession): + source_id = source_ids["mrna_source_id"] + counts["num_mrna"] += 1 + elif re.search(r"^NR", accession): + source_id = source_ids["ncrna_source_id"] + counts["num_ncrna"] += 1 + elif file_type == "peptide": + if re.search(r"^XP_", accession): + source_id = source_ids["pred_peptide_source_id"] + counts["num_pred_peptide"] += 1 
+ else: + source_id = source_ids["peptide_source_id"] + counts["num_peptide"] += 1 + + if not source_id: + logging.warning( + f"Could not get source ID for file type {file_type} for accession {accession}" + ) + + (acc_no_version, version) = version.split(".") + xref["ACCESSION"] = accession + if accession == acc_no_version: + xref["VERSION"] = version + + # Extract description (may be multi-line) + description = re.search( + r"^DEFINITION\s+([^[]+)", entry, flags=re.MULTILINE + ).group(1) + description = re.sub(r"\nACCESSION.*", "", description, flags=re.DOTALL) + description = re.sub(r"\n", "", description) + description = re.sub(r"{.*}-like", "", description) + description = re.sub(r"{.*}", "", description) + description = re.sub(r"\s+", " ", description) + if len(description) > 255: + description = description[0:255] + + # Extract sequence + sequence = re.search( + r"^\s*ORIGIN\s+(.+)", entry, flags=re.DOTALL | re.MULTILINE + ).group(1) + sequence_lines = sequence.split("\n") + parsed_sequence = "" + for seq_line in sequence_lines: + if seq_line: + sequence_only = re.search(r"^\s*\d+\s+(.*)$", seq_line).group(1) + if not sequence_only: + continue + parsed_sequence += sequence_only + parsed_sequence = re.sub(r"\s", "", parsed_sequence) + + # Extract related pair to current RefSeq accession + # For rna file, the pair is the protein_id + # For peptide file, the pair is in DBSOURCE REFSEQ accession + refseq_pair = None + match = re.search(r"DBSOURCE\s+REFSEQ: accession (\S+)", entry) + if match: + refseq_pair = match.group(1) + protein_id = re.findall(r"\/protein_id=.(\S+_\d+)", entry) + coded_by = re.findall(r"\/coded_by=.(\w+_\d+)", entry) + + for cb in coded_by: + xref["PAIR"] = cb + + if not xref.get("PAIR"): + xref["PAIR"] = refseq_pair + + if not xref.get("PAIR"): + for pi in protein_id: + xref["PAIR"] = pi + + xref["LABEL"] = f"{accession}.{version}" + xref["DESCRIPTION"] = description + xref["SOURCE_ID"] = source_id + xref["SEQUENCE"] = parsed_sequence + xref["SEQUENCE_TYPE"] = file_type + xref["SPECIES_ID"] = species_id + xref["INFO_TYPE"] = "SEQUENCE_MATCH" + xref["DEPENDENT_XREFS"] = [] + + # Extrat NCBIGene ids + seen_in_record = {} + ncbi_gene_ids = re.findall(r"db_xref=.GeneID:(\d+)", entry) + for gene_id in ncbi_gene_ids: + if not seen_in_record.get(gene_id) and entrez_acc_to_label.get(gene_id): + seen_in_record[gene_id] = 1 + + dependent = {} + dependent["SOURCE_ID"] = source_ids["entrez_source_id"] + dependent["LINKAGE_SOURCE_ID"] = source_id + dependent["ACCESSION"] = gene_id + dependent["LABEL"] = entrez_acc_to_label[gene_id] + xref["DEPENDENT_XREFS"].append(dependent) + counts["num_entrez"] += 1 + + dependent = {} + dependent["SOURCE_ID"] = source_ids["wiki_source_id"] + dependent["LINKAGE_SOURCE_ID"] = source_id + dependent["ACCESSION"] = gene_id + dependent["LABEL"] = entrez_acc_to_label[gene_id] + xref["DEPENDENT_XREFS"].append(dependent) + counts["num_wiki"] += 1 + + # Add xrefs for RefSeq mRNA as well where available + if refseq_pair: + refseq_pair = re.sub(r"\.[0-9]*", "", refseq_pair) + if refseq_pair: + if refseq_ids.get(refseq_pair): + for refseq_id in refseq_ids[refseq_pair]: + for entrez_id in entrez_ids.get(gene_id): + self.add_dependent_xref_maponly( + entrez_id, + source_ids["entrez_source_id"], + refseq_id, + None, + dbi, + ) + for wiki_id in wiki_ids.get(gene_id): + self.add_dependent_xref_maponly( + wiki_id, + source_ids["entrez_source_id"], + refseq_id, + None, + dbi, + ) + + xrefs.append(xref) + + if len(xrefs) > 0: + 
self.upload_xref_object_graphs(xrefs, dbi) + + result_message = f'Added {counts["num_mrna"]} mRNA xrefs, {counts["num_pred_mrna"]} predicted mRNA xrefs, {counts["num_ncrna"]} ncRNA xrefs, {counts["num_pred_ncrna"]} predicted ncRNA xrefs, {counts["num_peptide"]} peptide xrefs, and {counts["num_pred_peptide"]} predicted peptide xrefs\n' + result_message += f"Added the following dependent xrefs:\n" + result_message += f'\tEntrezGene\t{counts["num_entrez"]}\n' + result_message += f'\tWikiGene\t{counts["num_wiki"]}\n' + + return result_message + + def type_from_file(self, file_name: str) -> Optional[str]: + if re.search("RefSeq_protein", file_name): + return "peptide" + if re.search("rna", file_name): + return "dna" + if re.search("protein", file_name): + return "peptide" + + return None diff --git a/src/python/ensembl/production/xrefs/parsers/UCSCParser.py b/src/python/ensembl/production/xrefs/parsers/UCSCParser.py new file mode 100644 index 000000000..5de152912 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/UCSCParser.py @@ -0,0 +1,136 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for UCSC source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class UCSCParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + count = 0 + + file_io = self.get_filehandle(file) + csv_reader = csv.reader(file_io, delimiter="\t", strict=True) + + # Read lines + for line in csv_reader: + chromosome = line[1] + strand = line[2] + tx_start = int(line[3]) + tx_end = int(line[4]) + cds_start = int(line[5]) + cds_end = int(line[6]) + exon_starts = line[8] + exon_ends = line[9] + accession = line[11] + + # UCSC uses slightly different chromosome names, at least for + # human and mouse, so chop off the 'chr' in the beginning. We do + # not yet translate the names of the special chromosomes, e.g. + # "chr6_cox_hap1" (UCSC) into "c6_COX" (Ensembl) + chromosome = re.sub(r"\Achr", "", chromosome) + + # They also use '+' and '-' for the strand, instead of -1, 0, or 1 + if strand == "+": + strand = 1 + elif strand == "-": + strand = -1 + else: + strand = 0 + + # ... and non-coding transcripts have cds_start == cds_end. + # We would like these to be stored as NULLs + if cds_start == cds_end: + cds_start = None + cds_end = None + + # exon_starts and exon_ends usually have trailing commas, remove them + exon_starts = re.sub(r",\Z", "", exon_starts) + exon_ends = re.sub(r",\Z", "", exon_ends) + + # ... and they use the same kind of "inbetween" coordinates as e.g. 
+ # exonerate, so increment all start coordinates by one + tx_start += 1 + if cds_start: + cds_start += 1 + + # The string exon_starts is a comma-separated list of start coordinates + # for subsequent exons and we must increment each one. Split the string + # on commas, use map() to apply the "+1" transformation to every + # element of the resulting array, then join the result into a new + # comma-separated list + exon_starts = ",".join( + str(int(x) + 1) for x in re.split(r"\s*,\s*", exon_starts) + ) + + self.add_xref( + source_id, + species_id, + { + "accession": accession, + "chromosome": chromosome, + "strand": strand, + "txStart": tx_start, + "txEnd": tx_end, + "cdsStart": cds_start, + "cdsEnd": cds_end, + "exonStarts": exon_starts, + "exonEnds": exon_ends, + }, + xref_dbi, + ) + count += 1 + + file_io.close() + + result_message = f"Loaded a total of {count} UCSC xrefs" + + return 0, result_message + + def add_xref(self, source_id: int, species_id: int, xref: Dict[str, Any], dbi: Connection) -> None: + for required_key in [ + "accession", + "chromosome", + "strand", + "txStart", + "txEnd", + "exonStarts", + "exonEnds", + ]: + if not xref.get(required_key): + raise KeyError(f"Missing required key {required_key} for Xref") + + query = insert(CoordinateXrefORM).values( + source_id=source_id, + species_id=species_id, + accession=xref["accession"], + chromosome=xref["chromosome"], + strand=xref["strand"], + txStart=xref["txStart"], + txEnd=xref["txEnd"], + cdsStart=xref["cdsStart"], + cdsEnd=xref["cdsEnd"], + exonStarts=xref["exonStarts"], + exonEnds=xref["exonEnds"], + ) + dbi.execute(query) diff --git a/src/python/ensembl/production/xrefs/parsers/UniProtParser.py b/src/python/ensembl/production/xrefs/parsers/UniProtParser.py new file mode 100644 index 000000000..e99b33cdc --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/UniProtParser.py @@ -0,0 +1,452 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
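The coordinate handling in UCSCParser above converts UCSC's zero-based, half-open, '+'/'-' conventions to Ensembl's one-based, strand 1/-1 form; only start coordinates are incremented because a half-open end already equals a one-based inclusive end. A compact sketch on a made-up row:

import re

tx_start, cds_start, cds_end, strand = 1000, 1200, 1200, "+"
exon_starts = "1000,2000,3000,"  # trailing comma, as in the UCSC dump

strand = 1 if strand == "+" else -1 if strand == "-" else 0
if cds_start == cds_end:          # non-coding transcript: store NULLs
    cds_start = cds_end = None
tx_start += 1
if cds_start:
    cds_start += 1
exon_starts = ",".join(
    str(int(x) + 1)
    for x in re.split(r"\s*,\s*", re.sub(r",\Z", "", exon_starts))
)
print(tx_start, strand, cds_start, exon_starts)  # 1001 1 None 1001,2001,3001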
+ +"""Parser module for Uniprot sources.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + +import codecs + + +class UniProtParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + release_file = args["rel_file"] + verbose = args.get("verbose", False) + hgnc_file = args.get("hgnc_file") + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Get needed source ids + source_ids = { + "sp_source_id": self.get_source_id_for_source_name( + "Uniprot/SWISSPROT", xref_dbi, "sequence_mapped" + ), + "sptr_source_id": self.get_source_id_for_source_name( + "Uniprot/SPTREMBL", xref_dbi, "sequence_mapped" + ), + "sptr_non_display_source_id": self.get_source_id_for_source_name( + "Uniprot/SPTREMBL", xref_dbi, "protein_evidence_gt_2" + ), + "sp_direct_source_id": self.get_source_id_for_source_name( + "Uniprot/SWISSPROT", xref_dbi, "direct" + ), + "sptr_direct_source_id": self.get_source_id_for_source_name( + "Uniprot/SPTREMBL", xref_dbi, "direct" + ), + "isoform_source_id": self.get_source_id_for_source_name( + "Uniprot_isoform", xref_dbi + ), + } + + if verbose: + logging.info(f'SwissProt source ID = {source_ids["sp_source_id"]}') + logging.info(f'SpTREMBL source ID = {source_ids["sptr_source_id"]}') + logging.info( + f'SpTREMBL protein_evidence > 2 source ID = {source_ids["sptr_non_display_source_id"]}' + ) + logging.info( + f'SwissProt direct source ID = {source_ids["sp_direct_source_id"]}' + ) + logging.info( + f'SpTREMBL direct source ID = {source_ids["sptr_direct_source_id"]}' + ) + + # Parse and set release info + if release_file: + sp_release = None + sptr_release = None + + release_io = self.get_filehandle(release_file) + for line in release_io: + line = line.strip() + if not line: + continue + + match = re.search(r"(UniProtKB/Swiss-Prot Release .*)", line) + if match: + sp_release = match.group(1) + if verbose: + logging.info(f"Swiss-Prot release is {sp_release}") + else: + match = re.search(r"(UniProtKB/TrEMBL Release .*)", line) + if match: + sptr_release = match.group(1) + if verbose: + logging.info(f"SpTrEMBL release is {sptr_release}") + + release_io.close() + + # Set releases + self.set_release(source_ids["sp_source_id"], sp_release, xref_dbi) + self.set_release(source_ids["sptr_source_id"], sptr_release, xref_dbi) + self.set_release( + source_ids["sptr_non_display_source_id"], sptr_release, xref_dbi + ) + self.set_release(source_ids["sp_direct_source_id"], sp_release, xref_dbi) + self.set_release( + source_ids["sptr_direct_source_id"], sptr_release, xref_dbi + ) + + result_message = self.create_xrefs(source_ids, species_id, file, xref_dbi, hgnc_file) + + return 0, result_message + + def create_xrefs(self, source_ids: Dict[str, int], species_id: int, file: str, dbi: Connection, hgnc_file: str = None) -> str: + counts = { + "num_sp": 0, + "num_sptr": 0, + "num_sptr_non_display": 0, + "num_direct_sp": 0, + "num_direct_sptr": 0, + "num_isoform": 0, + } + dependent_xrefs_counts = {} + ensembl_derived_protein_count = 0 + count = 0 + + # Get sources ids of dependent sources + dependent_sources = self.get_xref_sources(dbi) + + # Extract descriptions from hgnc + hgnc_descriptions = {} + if hgnc_file: + hgnc_descriptions = self.get_hgnc_descriptions(hgnc_file) + + # Create a hash of all valid taxon_ids for this species + species_id_to_tax = 
self.species_id_to_taxonomy(dbi) + species_id_to_tax.setdefault(species_id, []).append(species_id) + tax_ids = species_id_to_tax[species_id] + tax_to_species_id = {tax_id: species_id for tax_id in tax_ids} + + xrefs = [] + + # Read file + for section in self.get_file_sections(file, "//\n"): + if len(section) == 1: + continue + + entry = "".join(section) + xref = {} + + # Extract the species taxon id + found = 0 + match = re.search(r"OX\s+[a-zA-Z_]+=([0-9 ,]+).*;", entry) + if match: + ox = match.group(1) + for taxon_id_from_file in ox.split(", "): + taxon_id_from_file = re.sub(r"\s", "", taxon_id_from_file) + if tax_to_species_id.get(taxon_id_from_file): + found = 1 + count += 1 + + # If no taxon_id's match, skip to next record + if not found: + continue + + # Check for CC (caution) lines containing certain text + # If sequence is from Ensembl, do not use + ensembl_derived_protein = 0 + if re.search( + r"CAUTION: The sequence shown here is derived from an Ensembl", entry + ): + ensembl_derived_protein = 1 + ensembl_derived_protein_count += 1 + + # Extract ^AC lines and build list of accessions + accessions = [] + accessions_only = re.findall(r"\nAC\s+(.+)", entry) + for accessions_line in accessions_only: + for acc in accessions_line.split(";"): + acc = acc.strip() + if acc: + accessions.append(acc) + accession = accessions[0] + + if accession.lower() == "unreviewed": + logging.warn( + f"WARNING: entries with accession of {accession} not allowed, will be skipped" + ) + continue + + xref["ACCESSION"] = accession + xref["INFO_TYPE"] = "SEQUENCE_MATCH" + xref["SYNONYMS"] = [] + for i in range(1, len(accessions)): + xref["SYNONYMS"].append(accessions[i]) + + sp_type = re.search(r"ID\s+(\w+)\s+(\w+)", entry).group(2) + protein_evidence_code = re.search(r"PE\s+(\d+)", entry).group(1) + version = re.search(r"DT\s+\d+-\w+-\d+, entry version (\d+)", entry).group( + 1 + ) + + # SwissProt/SPTrEMBL are differentiated by having STANDARD/PRELIMINARY here + if re.search(r"^Reviewed", sp_type, re.IGNORECASE): + xref["SOURCE_ID"] = source_ids["sp_source_id"] + counts["num_sp"] += 1 + elif re.search(r"Unreviewed", sp_type, re.IGNORECASE): + # Use normal source only if it is PE levels 1 & 2 + if protein_evidence_code and int(protein_evidence_code) < 3: + xref["SOURCE_ID"] = source_ids["sptr_source_id"] + counts["num_sptr"] += 1 + else: + xref["SOURCE_ID"] = source_ids["sptr_non_display_source_id"] + counts["num_sptr_non_display"] += 1 + else: + continue + + # Some straightforward fields + xref["LABEL"] = f"{accession}.{version}" + xref["VERSION"] = version + xref["SPECIES_ID"] = species_id + xref["SEQUENCE_TYPE"] = "peptide" + xref["STATUS"] = "experimental" + xref["DEPENDENT_XREFS"] = [] + xref["DIRECT_XREFS"] = [] + + # Extract ^DE lines only and build cumulative description string + description = "" + description_lines = re.findall(r"\nDE\s+(.+)", entry) + for line in description_lines: + match = re.search(r"RecName: Full=(.*);", line) + if match: + if description: + description += "; " + description += match.group(1) + else: + match = re.search(r"SubName: Full=(.*);", line) + if match: + if description: + description += "; " + description += match.group(1) + + description = re.sub(r"^\s*", "", description) + description = re.sub(r"\s*$", "", description) + description = re.sub(r"\s*\{ECO:.*?\}", "", description) + + # Parse the EC_NUMBER line, only for S.cerevisiae for now + if re.search(r"EC=", line) and species_id == "4932": + # Get the EC Number and make it an xref for S.cer if any + EC = 
re.search(r"\s*EC=([^;]+);", line).group(1) + + dependent = {} + dependent["LABEL"] = EC + dependent["ACCESSION"] = EC + dependent["SOURCE_NAME"] = "EC_NUMBER" + dependent["SOURCE_ID"] = dependent_sources["EC_NUMBER"] + dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] + xref["DEPENDENT_XREFS"].append(dependent) + dependent_xrefs_counts["EC_NUMBER"] = ( + dependent_xrefs_counts.get("EC_NUMBER", 0) + 1 + ) + + xref["DESCRIPTION"] = description + + # Extract sequence + sequence = re.search(r"SQ\s+(.+)", entry, flags=re.DOTALL).group(1) + sequence = re.sub(r"\n", "", sequence) + sequence = re.sub(r"\/\/", "", sequence) + sequence = re.sub(r"\s", "", sequence) + sequence = re.sub(r"^.*;", "", sequence) + xref["SEQUENCE"] = sequence + + # Extract gene names + gene_names = re.findall(r"\nGN\s+(.+)", entry) + gene_names = " ".join(gene_names).split(";") + + # Do not allow the addition of UniProt Gene Name dependent Xrefs + # if the protein was imported from Ensembl. Otherwise we will + # re-import previously set symbols + if not ensembl_derived_protein: + dependent = {} + name_found = 0 + gene_name = None + dep_synonyms = [] + for line in gene_names: + line = line.strip() + + if not re.search(r"Name=", line) and not re.search( + r"Synonyms=", line + ): + continue + + match = re.search(r"Name=([A-Za-z0-9_\-\.\s]+)", line) + if match and not name_found: + gene_name = match.group(1).rstrip() + gene_name = re.sub(r"\nGN", "", gene_name) + name_found = 1 + + match = re.search(r"Synonyms=(.*)", line) + if match: + synonym = match.group(1) + synonym = re.sub(r"\{.*?\}", "", synonym) + synonym = re.sub(r"\s+$", "", synonym) + synonym = re.sub(r"\s*,\s*", ",", synonym) + synonyms = synonym.split(",") + for synonym in synonyms: + if synonym not in dep_synonyms: + dep_synonyms.append(synonym) + + if gene_name: + dependent["LABEL"] = gene_name + dependent["ACCESSION"] = xref["ACCESSION"] + dependent["SOURCE_NAME"] = "Uniprot_gn" + dependent["SOURCE_ID"] = dependent_sources["Uniprot_gn"] + dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] + dependent["SYNONYMS"] = dep_synonyms + if hgnc_file and hgnc_descriptions.get(gene_name) is not None: + dependent["DESCRIPTION"] = hgnc_descriptions[gene_name] + xref["DEPENDENT_XREFS"].append(dependent) + dependent_xrefs_counts["Uniprot_gn"] = ( + dependent_xrefs_counts.get("Uniprot_gn", 0) + 1 + ) + + # Dependent xrefs - only store those that are from sources listed in the source table + deps = re.findall(r"\n(DR\s+.+)", entry) + + seen = {} + for dep in deps: + match = re.search(r"^DR\s+(.+)", dep) + if match: + vals = re.split(r";\s*", match.group(1)) + source = vals[0] + acc = vals[1] + extra = [] + if len(vals) > 2: + extra = vals[2 : len(vals)] + + # Skip external sources obtained through other files + if re.search( + r"^(GO|UniGene|RGD|CCDS|IPI|UCSC|SGD|HGNC|MGI|VGNC|Orphanet|ArrayExpress|GenomeRNAi|EPD|Xenbase|Reactome|MIM|GeneCards)", + source, + ): + continue + + # If mapped to Ensembl, add as direct xref + if source == "Ensembl": + direct = {} + isoform = {} + + stable_id = extra[0] + stable_id = re.sub(r"\.[0-9]+", "", stable_id) + direct["STABLE_ID"] = stable_id + direct["ENSEMBL_TYPE"] = "Translation" + direct["LINKAGE_TYPE"] = "DIRECT" + if xref["SOURCE_ID"] == source_ids["sp_source_id"]: + direct["SOURCE_ID"] = source_ids["sp_direct_source_id"] + counts["num_direct_sp"] += 1 + else: + direct["SOURCE_ID"] = source_ids["sptr_direct_source_id"] + counts["num_direct_sptr"] += 1 + xref["DIRECT_XREFS"].append(direct) + + match = re.search(r"(%s-[0-9]+)" % 
accession, extra[1]) + if match: + isoform = match.group(1) + self.add_to_direct_xrefs( + { + "stable_id": stable_id, + "ensembl_type": "translation", + "accession": isoform, + "label": isoform, + "source_id": source_ids["isoform_source_id"], + "linkage": "DIRECT", + "species_id": species_id, + }, + dbi, + ) + counts["num_isoform"] += 1 + + # Create dependent xref structure & store it + if dependent_sources.get(source): + dependent = {} + + dependent["SOURCE_NAME"] = source + dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] + dependent["SOURCE_ID"] = dependent_sources[source] + dependent["ACCESSION"] = acc + + if not seen.get(f"{source}:{acc}"): + xref["DEPENDENT_XREFS"].append(dependent) + dependent_xrefs_counts[source] = ( + dependent_xrefs_counts.get(source, 0) + 1 + ) + seen[f"{source}:{acc}"] = 1 + + if re.search(r"EMBL", dep) and not re.search(r"ChEMBL", dep): + protein_id = extra[0] + if protein_id != "-" and not seen.get( + f"{source}:{protein_id}" + ): + dependent = {} + + dependent["SOURCE_NAME"] = source + dependent["SOURCE_ID"] = dependent_sources["protein_id"] + dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] + dependent["LABEL"] = protein_id + dependent["ACCESSION"] = re.search( + r"([^.]+)\.([^.]+)", protein_id + ).group(1) + xref["DEPENDENT_XREFS"].append(dependent) + dependent_xrefs_counts[source] = ( + dependent_xrefs_counts.get(source, 0) + 1 + ) + seen[f"{source}:{protein_id}"] = 1 + + xrefs.append(xref) + + if count > 1000: + self.upload_xref_object_graphs(xrefs, dbi) + count = 0 + xrefs.clear() + + if len(xrefs) > 0: + self.upload_xref_object_graphs(xrefs, dbi) + + result_message = f'Read {counts["num_sp"]} SwissProt xrefs, {counts["num_sptr"]} SPTrEMBL xrefs with protein evidence codes 1-2, and {counts["num_sptr_non_display"]} SPTrEMBL xrefs with protein evidence codes > 2 from {file}\n' + result_message += f'Added {counts["num_direct_sp"]} direct SwissProt xrefs and {counts["num_direct_sptr"]} direct SPTrEMBL xrefs\n' + result_message += f'Added {counts["num_isoform"]} direct isoform xrefs\n' + result_message += f"Skipped {ensembl_derived_protein_count} ensembl annotations as Gene names\n" + + result_message += f"Added the following dependent xrefs:\n" + for xref_source, xref_count in dependent_xrefs_counts.items(): + result_message += f"\t{xref_source}\t{xref_count}\n" + + return result_message + + def get_hgnc_descriptions(self, hgnc_file: str) -> Dict[str, str]: + descriptions = {} + + # Make sure the file is utf8 + hgnc_file = codecs.encode(hgnc_file, "utf-8").decode("utf-8") + hgnc_file = re.sub(r'"', '', hgnc_file) + + hgnc_io = self.get_filehandle(hgnc_file) + csv_reader = csv.DictReader(hgnc_io, delimiter="\t") + + # Read lines + for line in csv_reader: + gene_name = line["Approved symbol"] + description = line["Approved name"] + + descriptions[gene_name] = description + + hgnc_io.close() + + return descriptions \ No newline at end of file diff --git a/src/python/ensembl/production/xrefs/parsers/VGNCParser.py b/src/python/ensembl/production/xrefs/parsers/VGNCParser.py new file mode 100644 index 000000000..21cb13d58 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/VGNCParser.py @@ -0,0 +1,93 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for VGNC source (uses HGNC Parser as parent).""" + +from ensembl.production.xrefs.parsers.HGNCParser import * + + +class VGNCParser(HGNCParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Create a hash of all valid taxon_ids for this species + species_id_to_tax = self.species_id_to_taxonomy(xref_dbi) + species_id_to_tax.setdefault(species_id, []).append(species_id) + + tax_ids = species_id_to_tax[species_id] + tax_to_species_id = {tax_id: species_id for tax_id in tax_ids} + + # Open the vgnc file + file_io = self.get_filehandle(file) + csv_reader = csv.DictReader(file_io, delimiter="\t") + + # Check if header has required columns + required_columns = [ + "taxon_id", + "ensembl_gene_id", + "vgnc_id", + "symbol", + "name", + "alias_symbol", + "prev_symbol", + ] + if not set(required_columns).issubset(set(csv_reader.fieldnames)): + raise IOError(f"Can't find required columns in VGNC file '{file}'") + + # Read lines + count = 0 + for line in csv_reader: + # Skip data for other species + if not tax_to_species_id.get(line["taxon_id"]): + continue + + # Add ensembl direct xref + if line["ensembl_gene_id"]: + self.add_to_direct_xrefs( + { + "stable_id": line["ensembl_gene_id"], + "ensembl_type": "gene", + "accession": line["vgnc_id"], + "label": line["symbol"], + "description": line["name"], + "source_id": source_id, + "species_id": species_id, + }, + xref_dbi, + ) + + self.add_synonyms_for_hgnc( + { + "source_id": source_id, + "name": line["vgnc_id"], + "species_id": species_id, + "dead": line["alias_symbol"], + "alias": line["prev_symbol"], + }, + xref_dbi, + ) + + count += 1 + + file_io.close() + + result_message = f"Loaded a total of {count} VGNC xrefs" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py b/src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py new file mode 100644 index 000000000..38c8ccbda --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py @@ -0,0 +1,76 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
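The GN-line handling in UniProtParser above takes the first Name= value as the gene symbol and collects Synonyms= values, stripping ECO evidence tags. A minimal sketch, assuming GN content in the UniProtKB flat-file style (the gene and synonym values here are illustrative only):

import re

gn_block = "Name=BRCA2 {ECO:0000303|PubMed:123}; Synonyms=FACD, FANCD1;"

gene_name, synonyms = None, []
for part in gn_block.split(";"):
    part = part.strip()
    m = re.search(r"Name=([A-Za-z0-9_\-\.\s]+)", part)
    if m and gene_name is None:
        gene_name = m.group(1).rstrip()       # first Name= wins
    m = re.search(r"Synonyms=(.*)", part)
    if m:
        syn = re.sub(r"\{.*?\}", "", m.group(1))   # drop {ECO:...} tags
        syn = re.sub(r"\s*,\s*", ",", syn.strip())
        synonyms.extend(s for s in syn.split(",") if s and s not in synonyms)

print(gene_name, synonyms)  # BRCA2 ['FACD', 'FANCD1']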
+ +"""Parser module for Xenbase source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class XenopusJamboreeParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + count = 0 + + file_io = self.get_filehandle(file) + csv_reader = csv.reader(file_io, delimiter="\t") + + # Read lines + for line in csv_reader: + accession = line[0] + label = line[1] + desc = line[2] + stable_id = line[3] + + # If there is a description, trim it a bit + if desc: + desc = self.parse_description(desc) + + if label == "unnamed": + label = accession + + self.add_to_direct_xrefs( + { + "stable_id": stable_id, + "ensembl_type": "gene", + "accession": accession, + "label": label, + "description": desc, + "source_id": source_id, + "species_id": species_id, + }, + xref_dbi, + ) + count += 1 + + file_io.close() + + result_message = f"{count} XenopusJamboreeParser xrefs succesfully parsed" + + return 0, result_message + + def parse_description(self, description: str) -> str: + # Remove some provenance information encoded in the description + description = re.sub(r"\s*\[.*\]", "", description) + + # Remove labels of type 5 of 14 from the description + description = re.sub(r",\s+\d+\s+of\s+\d+", "", description) + + return description diff --git a/src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py b/src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py new file mode 100644 index 000000000..4e703788a --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py @@ -0,0 +1,62 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Parser module for ZFIN Descriptions.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class ZFINDescParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + count = 0 + withdrawn = 0 + + file_io = self.get_filehandle(file) + csv_reader = csv.DictReader(file_io, delimiter="\t") + csv_reader.fieldnames = ["zfin", "desc", "label", "extra1", "extra2"] + + # Read lines + for line in csv_reader: + # Skip if WITHDRAWN: this precedes both desc and label + if re.search(r"\A WITHDRAWN:", line["label"]): + withdrawn += 1 + else: + xref_id = self.add_xref( + { + "accession": line["zfin"], + "label": line["label"], + "description": line["desc"], + "source_id": source_id, + "species_id": species_id, + "info_type": "MISC", + }, + xref_dbi, + ) + count += 1 + + file_io.close() + + result_message = ( + f"{count} ZFINDesc xrefs added, {withdrawn} withdrawn entries ignored" + ) + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/ZFINParser.py b/src/python/ensembl/production/xrefs/parsers/ZFINParser.py new file mode 100644 index 000000000..8734d62ca --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/ZFINParser.py @@ -0,0 +1,169 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Parser module for ZFIN source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class ZFINParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Get the ZFIN source ids + direct_src_id = self.get_source_id_for_source_name( + "ZFIN_ID", xref_dbi, "direct" + ) + dependent_src_id = self.get_source_id_for_source_name( + "ZFIN_ID", xref_dbi, "uniprot/refseq" + ) + description_src_id = self.get_source_id_for_source_name( + "ZFIN_ID", xref_dbi, "description_only" + ) + + # Get the ZFIN descriptions + description = {} + query = select(XrefUORM.accession, XrefUORM.description).where( + XrefUORM.source_id == description_src_id + ) + for row in xref_dbi.execute(query).mappings().all(): + if row.description: + description[row.accession] = row.description + + # Get the Uniprot and RefSeq accessions + swiss = self.get_valid_codes("uniprot/swissprot", species_id, xref_dbi) + refseq = self.get_valid_codes("refseq", species_id, xref_dbi) + + file_dir = os.path.dirname(file) + counts = {"direct": 0, "uniprot": 0, "refseq": 0, "synonyms": 0, "mismatch": 0} + + # Process ZFIN to ensEMBL mappings + zfin = {} + zfin_io = self.get_filehandle(os.path.join(file_dir, "ensembl_1_to_1.txt")) + zfin_csv_reader = csv.DictReader(zfin_io, delimiter="\t", strict=True) + zfin_csv_reader.fieldnames = ["zfin", "so", "label", "ensembl_id"] + for line in zfin_csv_reader: + self.add_to_direct_xrefs( + { + "stable_id": line["ensembl_id"], + "ensembl_type": "gene", + "accession": line["zfin"], + "label": line["label"], + "description": description.get(line["zfin"]), + "source_id": direct_src_id, + "species_id": species_id, + }, + xref_dbi, + ) + + zfin[line["zfin"]] = 1 + counts["direct"] += 1 + + zfin_io.close() + + # Process ZFIN to Uniprot mappings + swissprot_io = self.get_filehandle(os.path.join(file_dir, "uniprot.txt")) + swissprot_csv_reader = csv.DictReader(swissprot_io, delimiter="\t", strict=True) + swissprot_csv_reader.fieldnames = ["zfin", "so", "label", "acc"] + for line in swissprot_csv_reader: + if swiss.get(line["acc"]) and not zfin.get(line["zfin"]): + for xref_id in swiss[line["acc"]]: + self.add_dependent_xref( + { + "master_xref_id": xref_id, + "accession": line["zfin"], + "label": line["label"], + "description": description.get(line["zfin"]), + "source_id": dependent_src_id, + "species_id": species_id, + }, + xref_dbi, + ) + counts["uniprot"] += 1 + else: + counts["mismatch"] += 1 + + swissprot_io.close() + + # Process ZFIN to RefSeq mappings + refseq_io = self.get_filehandle(os.path.join(file_dir, "refseq.txt")) + refseq_csv_reader = csv.DictReader(refseq_io, delimiter="\t", strict=True) + refseq_csv_reader.fieldnames = ["zfin", "so", "label", "acc"] + for line in refseq_csv_reader: + # Ignore mappings to predicted RefSeq + if ( + re.search(r"^XP_", line["acc"]) + or re.search(r"^XM_", line["acc"]) + or re.search(r"^XR_", line["acc"]) + ): + continue + + if refseq.get(line["acc"]) and not zfin.get(line["zfin"]): + for xref_id in refseq[line["acc"]]: + self.add_dependent_xref( + { + "master_xref_id": xref_id, + "accession": line["zfin"], + "label": line["label"], + "description": description.get(line["zfin"]), + "source_id": source_id, + "species_id": species_id, + }, + xref_dbi, + ) + counts["refseq"] += 1 + 
else: + counts["mismatch"] += 1 + + refseq_io.close() + + # Get the ZFIN xrefs that have just been added + zfin = self.get_valid_codes("zfin", species_id, xref_dbi) + + sources = [] + query = select(SourceUORM.source_id).where(SourceUORM.name.like("ZFIN_ID")) + for row in xref_dbi.execute(query).fetchall(): + sources.append(row[0]) + + # Process the synonyms + aliases_io = self.get_filehandle(os.path.join(file_dir, "aliases.txt")) + aliases_csv_reader = csv.DictReader(aliases_io, delimiter="\t", strict=True) + aliases_csv_reader.fieldnames = ["acc", "cur_name", "cur_symbol", "syn", "so"] + for line in aliases_csv_reader: + if zfin.get(line["acc"]): + synonym = ( + unicodedata.normalize("NFKD", line["syn"]) + .encode("ascii", "namereplace") + .decode("ascii") + ) + self.add_to_syn_for_mult_sources( + line["acc"], sources, synonym, species_id, xref_dbi + ) + counts["synonyms"] += 1 + + aliases_io.close() + + result_message = f"{counts['direct']} direct ZFIN xrefs added and\n" + result_message += f"\t{counts['uniprot']} dependent xrefs from UniProt added\n" + result_message += f"\t{counts['refseq']} dependent xrefs from RefSeq added\n" + result_message += f"\t{counts['mismatch']} dependents ignored\n" + result_message += f"\t{counts['synonyms']} synonyms loaded" + + return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/__init__.py b/src/python/ensembl/production/xrefs/parsers/__init__.py new file mode 100644 index 000000000..e58354b3f --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/__init__.py @@ -0,0 +1,15 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Xref parsers modules.""" diff --git a/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py b/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py new file mode 100644 index 000000000..dcba51ccb --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py @@ -0,0 +1,113 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +"""Parser module for miRBase source.""" + +from ensembl.production.xrefs.parsers.BaseParser import * + + +class miRBaseParser(BaseParser): + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args["source_id"] + species_id = args["species_id"] + species_name = args["species_name"] + file = args["file"] + xref_dbi = args["xref_dbi"] + + if not source_id or not species_id or not file: + raise AttributeError("Need to pass source_id, species_id and file as pairs") + + # Get the species name(s) + species_to_names = self.species_id_to_names(xref_dbi) + if species_name: + species_to_names.setdefault(species_id, []).append(species_name) + if not species_to_names.get(species_id): + return 0, "Skipped. Could not find species ID to name mapping" + + names = species_to_names[species_id] + name_to_species_id = {name: species_id for name in names} + + xrefs = self.create_xrefs(source_id, file, species_id, name_to_species_id) + if not xrefs: + return 0, "No xrefs added" + + self.upload_xref_object_graphs(xrefs, xref_dbi) + + result_message = "Read %d xrefs from %s" % (len(xrefs), file) + + return 0, result_message + + def create_xrefs(self, source_id: int, file: str, species_id: int, name_to_species_id: Dict[str, int]) -> List[Dict[str, Any]]: + xrefs = [] + + # Read mirbase file + for section in self.get_file_sections(file, "//\n"): + if len(section) == 1: + continue + + entry = "".join(section) + if not entry: + continue + + xref = {} + + (header, sequence) = re.split(r"\nSQ", entry, 2) + species = None + + # Extract sequence + if sequence: + seq_lines = sequence.split("\n") + seq_lines.pop(0) + + sequence = "".join(seq_lines) + sequence = sequence.upper() + sequence = re.sub("U", "T", sequence) + sequence = re.sub(r"[\d+,\s+]", "", sequence) + + # Extract name, accession, and description + name = re.search(r"^ID\s+(\S+)\s+", header, flags=re.MULTILINE).group(1) + accession = re.search(r"^AC\s+(\S+);\s+", header, flags=re.MULTILINE).group( + 1 + ) + description = re.search( + r"^DE\s+(.+)\s+stem(-|\s)loop", header, flags=re.MULTILINE + ).group(1) + + # Format description and extract species name + if description: + description_parts = re.split(r"\s+", description) + description_parts.pop() + species = " ".join(description_parts) + species = species.lower() + species = re.sub(" ", "_", species) + + # If no species match, skip to next record + species_id_check = name_to_species_id.get(species) + if not species_id_check: + continue + + if species_id and species_id == species_id_check: + xref = { + "SEQUENCE_TYPE": "dna", + "STATUS": "experimental", + "SOURCE_ID": source_id, + "ACCESSION": accession, + "LABEL": name, + "DESCRIPTION": name, + "SEQUENCE": sequence, + "SPECIES_ID": species_id, + } + xrefs.append(xref) + + return xrefs From 9047a9bcb4cdfdf09ab5b2a63df2dba0a64a6119 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 28 Oct 2024 14:49:18 +0000 Subject: [PATCH 02/12] Parsers: optimizations + unit tests --- scripts/xrefs/refseq_coordinate_parser.pl | 2 +- .../xrefs/parsers/ArrayExpressParser.py | 158 +++-- .../production/xrefs/parsers/BaseParser.py | 51 +- .../production/xrefs/parsers/CCDSParser.py | 107 ++-- .../production/xrefs/parsers/DBASSParser.py | 139 ++-- .../xrefs/parsers/EntrezGeneParser.py | 126 ++-- .../production/xrefs/parsers/HGNCParser.py | 484 ++++++-------- .../production/xrefs/parsers/HPAParser.py | 60 +- .../xrefs/parsers/JGI_ProteinParser.py | 59 +- .../production/xrefs/parsers/MGIDescParser.py | 107 ++++ .../production/xrefs/parsers/MGIParser.py 
| 53 +- .../xrefs/parsers/MGI_CCDS_Parser.py | 107 ---- .../xrefs/parsers/MGI_Desc_Parser.py | 101 --- .../production/xrefs/parsers/MIMParser.py | 138 ++-- .../xrefs/parsers/Mim2GeneParser.py | 125 ++-- .../production/xrefs/parsers/RFAMParser.py | 211 ++++--- .../production/xrefs/parsers/RGDParser.py | 130 ++-- .../xrefs/parsers/ReactomeParser.py | 275 ++++---- .../xrefs/parsers/RefSeqCoordinateParser.py | 125 ++-- .../xrefs/parsers/RefSeqGPFFParser.py | 341 ---------- .../production/xrefs/parsers/RefSeqParser.py | 316 ++++++++++ .../production/xrefs/parsers/UCSCParser.py | 145 ++--- .../production/xrefs/parsers/UniProtParser.py | 575 ++++++++--------- .../production/xrefs/parsers/VGNCParser.py | 89 +-- .../xrefs/parsers/XenopusJamboreeParser.py | 92 +-- .../xrefs/parsers/ZFINDescParser.py | 74 +-- .../production/xrefs/parsers/ZFINParser.py | 249 ++++---- .../production/xrefs/parsers/miRBaseParser.py | 79 ++- src/python/test/xrefs/__init__.py | 15 + src/python/test/xrefs/conftest.py | 135 ++++ .../test/xrefs/parsers/flatfiles/dbass3.txt | 9 + .../test/xrefs/parsers/flatfiles/dbass5.txt | 8 + .../xrefs/parsers/flatfiles/entrezgene.txt | 13 + .../test/xrefs/parsers/flatfiles/hgnc.txt | 21 + .../test/xrefs/parsers/flatfiles/hpa.txt | 11 + .../xrefs/parsers/flatfiles/jgi_protein.fasta | 108 ++++ .../test/xrefs/parsers/flatfiles/mgi.txt | 10 + .../test/xrefs/parsers/flatfiles/mgi_desc.txt | 11 + .../test/xrefs/parsers/flatfiles/mim.txt | 122 ++++ .../test/xrefs/parsers/flatfiles/mim2gene.txt | 10 + .../test/xrefs/parsers/flatfiles/mirbase.txt | 506 +++++++++++++++ .../parsers/flatfiles/reactome_UniProt.txt | 8 + .../parsers/flatfiles/reactome_ensembl.txt | 14 + .../parsers/flatfiles/reactome_release.txt | 1 + .../parsers/flatfiles/refseq_protein.txt | 291 +++++++++ .../parsers/flatfiles/refseq_release.txt | 94 +++ .../xrefs/parsers/flatfiles/refseq_rna.txt | 508 +++++++++++++++ .../test/xrefs/parsers/flatfiles/rfam.txt | 381 +++++++++++ .../test/xrefs/parsers/flatfiles/rgd.txt | 98 +++ .../test/xrefs/parsers/flatfiles/ucsc.txt | 10 + .../parsers/flatfiles/uniprot_release.txt | 3 + .../parsers/flatfiles/uniprot_swissprot.txt | 591 ++++++++++++++++++ .../parsers/flatfiles/uniprot_trembl.txt | 570 +++++++++++++++++ .../test/xrefs/parsers/flatfiles/vgnc.txt | 11 + .../parsers/flatfiles/xenopus_jamboree.txt | 12 + .../xrefs/parsers/flatfiles/zfin/aliases.txt | 10 + .../parsers/flatfiles/zfin/ensembl_1_to_1.txt | 10 + .../xrefs/parsers/flatfiles/zfin/refseq.txt | 10 + .../xrefs/parsers/flatfiles/zfin/uniprot.txt | 10 + .../xrefs/parsers/flatfiles/zfin_desc.txt | 9 + .../xrefs/parsers/test_arrayexpress_parser.py | 110 ++++ .../test/xrefs/parsers/test_ccds_parser.py | 91 +++ .../test/xrefs/parsers/test_dbass_parser.py | 147 +++++ .../xrefs/parsers/test_entrezgene_parser.py | 157 +++++ .../test/xrefs/parsers/test_hgnc_parser.py | 182 ++++++ .../test/xrefs/parsers/test_hpa_parser.py | 132 ++++ .../xrefs/parsers/test_jgi_protein_parser.py | 61 ++ .../xrefs/parsers/test_mgi_desc_parser.py | 148 +++++ .../test/xrefs/parsers/test_mgi_parser.py | 84 +++ .../xrefs/parsers/test_mim2gene_parser.py | 250 ++++++++ .../test/xrefs/parsers/test_mim_parser.py | 126 ++++ .../test/xrefs/parsers/test_mirbase_parser.py | 111 ++++ .../xrefs/parsers/test_reactome_parser.py | 166 +++++ .../test/xrefs/parsers/test_refseq_parser.py | 243 +++++++ .../test/xrefs/parsers/test_rfam_parser.py | 130 ++++ .../test/xrefs/parsers/test_rgd_parser.py | 126 ++++ .../test/xrefs/parsers/test_ucsc_parser.py | 89 +++ 
.../test/xrefs/parsers/test_uniprot_parser.py | 181 ++++++ .../test/xrefs/parsers/test_vgnc_parser.py | 96 +++ .../parsers/test_xenopus_jamboree_parser.py | 78 +++ .../xrefs/parsers/test_zfin_desc_parser.py | 63 ++ .../test/xrefs/parsers/test_zfin_parser.py | 165 +++++ src/python/test/xrefs/pytest.ini | 2 + src/python/test/xrefs/test_helpers.py | 80 +++ 84 files changed, 8813 insertions(+), 2343 deletions(-) create mode 100644 src/python/ensembl/production/xrefs/parsers/MGIDescParser.py delete mode 100644 src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py delete mode 100644 src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py delete mode 100644 src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py create mode 100644 src/python/ensembl/production/xrefs/parsers/RefSeqParser.py create mode 100644 src/python/test/xrefs/__init__.py create mode 100644 src/python/test/xrefs/conftest.py create mode 100644 src/python/test/xrefs/parsers/flatfiles/dbass3.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/dbass5.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/entrezgene.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/hgnc.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/hpa.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/jgi_protein.fasta create mode 100644 src/python/test/xrefs/parsers/flatfiles/mgi.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/mgi_desc.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/mim.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/mim2gene.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/mirbase.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/reactome_UniProt.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/reactome_ensembl.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/reactome_release.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/refseq_protein.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/refseq_release.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/refseq_rna.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/rfam.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/rgd.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/ucsc.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/uniprot_release.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/uniprot_swissprot.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/uniprot_trembl.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/vgnc.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/xenopus_jamboree.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/zfin/aliases.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/zfin/ensembl_1_to_1.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/zfin/refseq.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/zfin/uniprot.txt create mode 100644 src/python/test/xrefs/parsers/flatfiles/zfin_desc.txt create mode 100644 src/python/test/xrefs/parsers/test_arrayexpress_parser.py create mode 100644 src/python/test/xrefs/parsers/test_ccds_parser.py create mode 100644 src/python/test/xrefs/parsers/test_dbass_parser.py create mode 100644 src/python/test/xrefs/parsers/test_entrezgene_parser.py create mode 100644 src/python/test/xrefs/parsers/test_hgnc_parser.py create mode 100644 
src/python/test/xrefs/parsers/test_hpa_parser.py create mode 100644 src/python/test/xrefs/parsers/test_jgi_protein_parser.py create mode 100644 src/python/test/xrefs/parsers/test_mgi_desc_parser.py create mode 100644 src/python/test/xrefs/parsers/test_mgi_parser.py create mode 100644 src/python/test/xrefs/parsers/test_mim2gene_parser.py create mode 100644 src/python/test/xrefs/parsers/test_mim_parser.py create mode 100644 src/python/test/xrefs/parsers/test_mirbase_parser.py create mode 100644 src/python/test/xrefs/parsers/test_reactome_parser.py create mode 100644 src/python/test/xrefs/parsers/test_refseq_parser.py create mode 100644 src/python/test/xrefs/parsers/test_rfam_parser.py create mode 100644 src/python/test/xrefs/parsers/test_rgd_parser.py create mode 100644 src/python/test/xrefs/parsers/test_ucsc_parser.py create mode 100644 src/python/test/xrefs/parsers/test_uniprot_parser.py create mode 100644 src/python/test/xrefs/parsers/test_vgnc_parser.py create mode 100644 src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py create mode 100644 src/python/test/xrefs/parsers/test_zfin_desc_parser.py create mode 100644 src/python/test/xrefs/parsers/test_zfin_parser.py create mode 100644 src/python/test/xrefs/pytest.ini create mode 100644 src/python/test/xrefs/test_helpers.py diff --git a/scripts/xrefs/refseq_coordinate_parser.pl b/scripts/xrefs/refseq_coordinate_parser.pl index 808284ee4..dae228391 100644 --- a/scripts/xrefs/refseq_coordinate_parser.pl +++ b/scripts/xrefs/refseq_coordinate_parser.pl @@ -99,7 +99,7 @@ # Not all species have refseq_import data, skip if not found if (!defined $logic_name) { print STDERR "No data found for RefSeq_import, skipping import\n";; - return; + exit 1; } # Get otherfeatures chromosomes diff --git a/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py b/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py index 988b92ffa..53e78e887 100644 --- a/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py +++ b/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py @@ -14,33 +14,36 @@ """Parser module for ArrayExpress source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import logging +from typing import Dict, Any, Tuple, List, Optional +from sqlalchemy import select +from sqlalchemy.engine import URL +from ftplib import FTP +from ensembl.core.models import Gene as GeneORM + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class ArrayExpressParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - species_name = args["species_name"] - file = args["file"] - dba = args["dba"] - ensembl_release = args["ensembl_release"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) - - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + source_id = args.get("source_id") + species_id = args.get("species_id") + species_name = args.get("species_name") + xref_file = args.get("file", "") + dba = args.get("dba") + ensembl_release = args.get("ensembl_release") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) + + if not source_id or not species_id: + raise AttributeError("Missing required arguments: source_id and species_id") # Extract db connection parameters from file name - project, db_user, db_host, db_port, db_name, db_pass = ( - self.extract_params_from_string( - file, ["project", 
"user", "host", "port", "dbname", "pass"] - ) + project, db_user, db_host, db_port, db_name, db_pass = self.extract_params_from_string( + xref_file, ["project", "user", "host", "port", "dbname", "pass"] ) - if not db_user: - db_user = "ensro" - if not db_port: - db_port = "3306" + db_user = db_user or "ensro" + db_port = db_port or "3306" # Get the species name(s) species_id_to_names = self.species_id_to_names(xref_dbi) @@ -49,67 +52,31 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: if not species_id_to_names.get(species_id): return 0, "Skipped. Could not find species ID to name mapping" - names = species_id_to_names[species_id] # Look up the species in ftp server and check if active - species_lookup = self._get_species() - active = self._is_active(species_lookup, names, verbose) - if not active: + species_lookup = self.get_species_from_ftp() + if not self.is_arryaexpress_active(species_lookup, species_id_to_names[species_id], verbose): return 0, "Skipped. ArrayExpress source not active for species" species_name = species_id_to_names[species_id][0] # Connect to the appropriate arrayexpress db - if db_host: - arrayexpress_db_url = URL.create( - "mysql", db_user, db_pass, db_host, db_port, db_name - ) - elif project and project == "ensembl": - if verbose: - logging.info("Looking for db in mysql-ens-sta-1") - registry = "ensro@mysql-ens-sta-1:4519" - arrayexpress_db_url = self.get_db_from_registry( - species_name, "core", ensembl_release, registry - ) - elif project and project == "ensemblgenomes": - if verbose: - logging.info( - "Looking for db in mysql-eg-staging-1 and mysql-eg-staging-2" - ) - registry = "ensro@mysql-eg-staging-1.ebi.ac.uk:4160" - arrayexpress_db_url = self.get_db_from_registry( - species_name, "core", ensembl_release, registry - ) - - if not arrayexpress_db_url: - registry = "ensro@mysql-eg-staging-2.ebi.ac.uk:4275" - arrayexpress_db_url = self.get_db_from_registry( - species_name, "core", ensembl_release, registry - ) - elif dba: - arrayexpress_db_url = dba - else: - arrayexpress_db_url = None + arrayexpress_db_url = self.get_arrayexpress_db_url( + project, db_user, db_pass, db_host, db_port, db_name, species_name, ensembl_release, dba, verbose + ) if not arrayexpress_db_url: - raise IOError( - f"Could not find ArrayExpress DB. Missing or unsupported project value. Supported values: ensembl, ensemblgenomes." + raise AttributeError( + "Could not find ArrayExpress DB. Missing or unsupported project value. Supported values: ensembl, ensemblgenomes." 
) - else: - if verbose: - logging.info(f"Found ArrayExpress DB: {arrayexpress_db_url}") xref_count = 0 - db_engine = self.get_db_engine(arrayexpress_db_url) - with db_engine.connect() as arrayexpress_dbi: - query = select(GeneORM.stable_id).where( - GeneORM.biotype != "LRG_gene", GeneORM.is_current == 1 - ) - result = arrayexpress_dbi.execute(query).mappings().all() + # Get data from arrayexpress db + arrayexpress_data = self.get_arrayexpress_data(arrayexpress_db_url) # Add direct xref for every current gene found - for row in result: + for row in arrayexpress_data: xref_id = self.add_xref( { "accession": row.stable_id, @@ -121,41 +88,64 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: xref_dbi, ) self.add_direct_xref(xref_id, row.stable_id, "gene", "", xref_dbi) - xref_count += 1 result_message = f"Added {xref_count} DIRECT xrefs" - return 0, result_message - def _get_species(self) -> Dict[str, int]: + def get_species_from_ftp(self) -> Dict[str, bool]: ftp_server = "ftp.ebi.ac.uk" ftp_dir = "pub/databases/microarray/data/atlas/bioentity_properties/ensembl" species_lookup = {} - ftp = FTP(ftp_server) - ftp.login("anonymous", "-anonymous@") - ftp.cwd(ftp_dir) - remote_files = ftp.nlst() - ftp.close() + with FTP(ftp_server) as ftp: + ftp.login("anonymous", "-anonymous@") + ftp.cwd(ftp_dir) + remote_files = ftp.nlst() for file in remote_files: species = file.split(".")[0] - species_lookup[species] = 1 + species_lookup[species] = True return species_lookup - def _is_active(self, species_lookup: Dict[str, int], names: List[str], verbose: bool) -> bool: - # Loop through the names and aliases first. If we get a hit then great - active = False + def is_arryaexpress_active(self, species_lookup: Dict[str, bool], names: List[str], verbose: bool) -> bool: for name in names: if species_lookup.get(name): if verbose: - logging.info( - f"Found ArrayExpress has declared the name {name}. This was an alias" - ) - active = True - break + logging.info(f"Found ArrayExpress has declared the name {name}. 
This was an alias") + return True + return False + + def get_arrayexpress_db_url(self, project: str, db_user: str, db_pass: str, db_host: str, db_port: str, db_name: str, species_name: str, ensembl_release: str, dba: str, verbose: bool) -> Optional[URL]: + if db_host: + return URL.create("mysql", db_user, db_pass, db_host, db_port, db_name) + elif project == "ensembl": + if verbose: + logging.info("Looking for db in mysql-ens-sta-1") + registry = "ensro@mysql-ens-sta-1:4519" + return self.get_db_from_registry(species_name, "core", ensembl_release, registry) + elif project == "ensemblgenomes": + if verbose: + logging.info("Looking for db in mysql-eg-staging-1 and mysql-eg-staging-2") + registry = "ensro@mysql-eg-staging-1.ebi.ac.uk:4160" + db_url = self.get_db_from_registry(species_name, "core", ensembl_release, registry) + if not db_url: + registry = "ensro@mysql-eg-staging-2.ebi.ac.uk:4275" + return self.get_db_from_registry(species_name, "core", ensembl_release, registry) + return db_url + elif dba: + return dba - return active + return None + + def get_arrayexpress_data(self, arrayexpress_db_url: URL) -> List[Dict[str, Any]]: + db_engine = self.get_db_engine(arrayexpress_db_url) + with db_engine.connect() as arrayexpress_dbi: + query = select(GeneORM.stable_id).where( + GeneORM.biotype != "LRG_gene", GeneORM.is_current == 1 + ) + result = arrayexpress_dbi.execute(query).mappings().all() + + return result diff --git a/src/python/ensembl/production/xrefs/parsers/BaseParser.py b/src/python/ensembl/production/xrefs/parsers/BaseParser.py index 3ae7c2e2c..ad6440e37 100644 --- a/src/python/ensembl/production/xrefs/parsers/BaseParser.py +++ b/src/python/ensembl/production/xrefs/parsers/BaseParser.py @@ -14,8 +14,27 @@ """Base xref parser module to include all common functions used by xref parsers.""" -from ensembl.production.xrefs.Base import * - +import re +from sqlalchemy import select, update, func +from sqlalchemy.dialects.mysql import insert +from sqlalchemy.engine import Connection +from sqlalchemy.orm import aliased +from typing import List, Dict, Any, Optional + +from ensembl.xrefs.xref_update_db_model import ( + Source as SourceUORM, + Xref as XrefUORM, + PrimaryXref as PrimaryXrefORM, + DependentXref as DependentXrefUORM, + GeneDirectXref as GeneDirectXrefORM, + TranscriptDirectXref as TranscriptDirectXrefORM, + TranslationDirectXref as TranslationDirectXrefORM, + Synonym as SynonymORM, + Pairs as PairsORM, + Species as SpeciesORM, +) + +from ensembl.production.xrefs.Base import Base class BaseParser(Base): """Class to represent the base of xref parser modules. Inherits the xref Base class.""" @@ -67,9 +86,8 @@ def get_source_id_for_source_name(self, source_name: str, dbi: Connection, prior ) result = dbi.execute(query) - if result: - source_id = result.scalar() - else: + source_id = result.scalar() + if source_id is None: raise KeyError(f"No source_id for source_name={source_name}") return source_id @@ -117,11 +135,12 @@ def set_release(self, source_id: int, s_release: str, dbi: Connection) -> None: dbi: sqlalchemy.engine.Connection The database connection to update in """ - dbi.execute( - update(SourceUORM) - .where(SourceUORM.source_id == source_id) - .values(source_release=s_release) - ) + if s_release: + dbi.execute( + update(SourceUORM) + .where(SourceUORM.source_id == source_id) + .values(source_release=s_release) + ) def upload_xref_object_graphs(self, xrefs: List[Dict[str, Any]], dbi: Connection) -> None: """Adds xref data into a database. 
@@ -558,7 +577,9 @@ def add_dependent_xref_maponly(self, dependent_id: int, dependent_source_id: int linkage_annotation=master_source_id, linkage_source_id=dependent_source_id, ) - .prefix_with("IGNORE") + .on_duplicate_key_update( + linkage_source_id=dependent_source_id + ) ) self._xref_dependent_mapped[index] = master_source_id @@ -620,7 +641,9 @@ def add_synonym(self, xref_id: int, synonym: str, dbi: Connection) -> None: dbi.execute( insert(SynonymORM) .values(xref_id=xref_id, synonym=synonym) - .prefix_with("IGNORE") + .on_duplicate_key_update( + synonym=synonym + ) ) def get_ext_synonyms(self, source_name: str, dbi: Connection) -> Dict[str, List[str]]: @@ -658,7 +681,7 @@ def get_ext_synonyms(self, source_name: str, dbi: Connection) -> Dict[str, List[ ext_syns.setdefault(row.label, []).append(row.synonym) count += 1 - seen[acc_syn] = 1 + seen[acc_syn] = True return ext_syns @@ -684,7 +707,7 @@ def build_dependent_mappings(self, source_id: int, dbi: Connection) -> None: for row in dbi.execute(query).mappings().all(): self._xref_dependent_mapped[ - row.master_xref_id + "|" + row.dependent_xref_id + f"{row.master_xref_id}|{row.dependent_xref_id}" ] = row.linkage_annotation def get_valid_codes(self, source_name: str, species_id: int, dbi: Connection) -> Dict[str, List[int]]: diff --git a/src/python/ensembl/production/xrefs/parsers/CCDSParser.py b/src/python/ensembl/production/xrefs/parsers/CCDSParser.py index f2e258716..24d1e088c 100644 --- a/src/python/ensembl/production/xrefs/parsers/CCDSParser.py +++ b/src/python/ensembl/production/xrefs/parsers/CCDSParser.py @@ -14,30 +14,41 @@ """Parser module for CCDS source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import logging +from sqlalchemy import select +from sqlalchemy.engine import URL +from typing import Dict, Any, Tuple, List +from ensembl.core.models import ( + Transcript as TranscriptORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + ObjectXref as ObjectXrefCORM, +) + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class CCDSParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - dba = args["dba"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file", "") + dba = args.get("dba") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id: + raise AttributeError("Missing required arguments: source_id and species_id") # Extract db connection parameters from file db_user = "ensro" db_host, db_port, db_name, db_pass = self.extract_params_from_string( - file, ["host", "port", "dbname", "pass"] + xref_file, ["host", "port", "dbname", "pass"] ) - if not db_port: - db_port = "3306" + db_port = db_port or "3306" # Connect to the appropriate db + ccds_db_url = None if db_host: ccds_db_url = URL.create( "mysql", db_user, db_pass, db_host, db_port, db_name @@ -47,47 +58,36 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: if not ccds_db_url: return 1, "Could not find CCDS DB." 
- else: - if verbose: - logging.info(f"Found CCDS DB: {ccds_db_url}") + if verbose: + logging.info(f"Found CCDS DB: {ccds_db_url}") # Get data from ccds db - db_engine = self.get_db_engine(ccds_db_url) - with db_engine.connect() as ccds_dbi: - query = ( - select(TranscriptORM.stable_id, XrefCORM.dbprimary_acc) - .where( - XrefCORM.xref_id == ObjectXrefCORM.xref_id, - ObjectXrefCORM.ensembl_object_type == "Transcript", - ObjectXrefCORM.ensembl_id == TranscriptORM.transcript_id, - ExternalDbORM.external_db_id == XrefCORM.external_db_id, - ) - .filter(ExternalDbORM.db_name.like("Ens_%_transcript")) - ) - result = ccds_dbi.execute(query).mappings().all() + ccds_data = self.get_ccds_data(ccds_db_url) xref_count, direct_count = 0, 0 seen = {} - for row in result: + for row in ccds_data: stable_id = row.stable_id display_label = row.dbprimary_acc - (acc, version) = display_label.split(".") - - if not seen.get(display_label): - xref_id = self.add_xref( - { - "accession": acc, - "version": version, - "label": display_label, - "source_id": source_id, - "species_id": species_id, - "info_type": "DIRECT", - }, - xref_dbi, - ) - + if "." in display_label: + acc, version = display_label.split(".") + else: + acc, version = display_label, None + + if display_label not in seen: + xref_args = { + "accession": acc, + "label": display_label, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + } + if version is not None: + args["version"] = version + + xref_id = self.add_xref(xref_args, xref_dbi) xref_count += 1 seen[display_label] = xref_id else: @@ -96,6 +96,23 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: self.add_direct_xref(xref_id, stable_id, "Transcript", "", xref_dbi) direct_count += 1 - result_message = f"Parsed CCDS identifiers from {file}, added {xref_count} xrefs and {direct_count} direct_xrefs" + result_message = f"Parsed CCDS identifiers, added {xref_count} xrefs and {direct_count} direct_xrefs" return 0, result_message + + def get_ccds_data(self, ccds_db_url: str) -> List[Dict[str, Any]]: + db_engine = self.get_db_engine(ccds_db_url) + with db_engine.connect() as ccds_dbi: + query = ( + select(TranscriptORM.stable_id, XrefCORM.dbprimary_acc) + .join(ObjectXrefCORM, XrefCORM.xref_id == ObjectXrefCORM.xref_id) + .join(TranscriptORM, ObjectXrefCORM.ensembl_id == TranscriptORM.transcript_id) + .join(ExternalDbORM, ExternalDbORM.external_db_id == XrefCORM.external_db_id) + .where( + ObjectXrefCORM.ensembl_object_type == "Transcript", + ExternalDbORM.db_name.like("Ens_%_transcript") + ) + ) + result = ccds_dbi.execute(query).mappings().all() + + return result \ No newline at end of file diff --git a/src/python/ensembl/production/xrefs/parsers/DBASSParser.py b/src/python/ensembl/production/xrefs/parsers/DBASSParser.py index 9f3f6243a..0a453a029 100644 --- a/src/python/ensembl/production/xrefs/parsers/DBASSParser.py +++ b/src/python/ensembl/production/xrefs/parsers/DBASSParser.py @@ -14,101 +14,84 @@ """Parser module for DBASS sources.""" -from ensembl.production.xrefs.parsers.BaseParser import * - -EXPECTED_NUMBER_OF_COLUMNS = 23 +import csv +import re +from typing import Any, Dict, Optional, Tuple +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class DBASSParser(BaseParser): + EXPECTED_NUMBER_OF_COLUMNS = 23 + SLASH_PATTERN = re.compile(r"(.*)\s?/\s?(.*)", re.IGNORECASE | re.DOTALL) + PARENS_PATTERN = re.compile(r"(.*)\s?\((.*)\)", re.IGNORECASE | re.DOTALL) + def run(self, args: Dict[str, Any]) -> 
Tuple[int, str]: - source_id = args.get("source_id") + source_id = args.get("source_id") species_id = args.get("species_id") - xref_file = args.get("file") - xref_dbi = args.get("xref_dbi") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") if not source_id or not species_id or not xref_file: - raise AttributeError("Need to pass source_id, species_id and file") + raise AttributeError("Missing required arguments: source_id, species_id, and file") + + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"DBASS file is empty") + file_io.seek(0) + + csv_reader = csv.reader(file_io) + header = next(csv_reader) + patterns = [r"^id$", r"^genesymbol$", None, r"^ensemblreference$"] + if not self.is_file_header_valid(self.EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise ValueError(f"Malformed or unexpected header in DBASS file {xref_file}") - file_io = self.get_filehandle(xref_file) - csv_reader = csv.reader(file_io) + processed_count, unmapped_count = self.process_lines(csv_reader, source_id, species_id, xref_dbi) - # Check if header is valid - header = next(csv_reader) - patterns = [r"^id$", r"^genesymbol$", None, r"^ensemblreference$"] - if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): - raise IOError(f"Malformed or unexpected header in DBASS file {xref_file}") + result_message = f"{processed_count} direct xrefs successfully processed\n" + result_message += f"Skipped {unmapped_count} unmapped xrefs" + return 0, result_message + def process_lines(self, csv_reader: csv.reader, source_id: int, species_id: int, xref_dbi: Connection) -> Tuple[int, int]: processed_count = 0 unmapped_count = 0 - # Read lines for line in csv_reader: if not line: continue - if len(line) < EXPECTED_NUMBER_OF_COLUMNS: - line_number = 2 + processed_count + unmapped_count - raise IOError( - f"Line {line_number} of input file {xref_file} has an incorrect number of columns" - ) - - dbass_gene_id = line[0] - dbass_gene_name = line[1] - dbass_full_name = line[2] - ensembl_id = line[3] - - # Do not attempt to create unmapped xrefs. Checking truthiness is good - # enough here because the only non-empty string evaluating as false is - # not a valid Ensembl stable ID. - if ensembl_id: - # DBASS files list synonyms in two ways: either "FOO (BAR)" (with or - # without space) or "FOO/BAR". Both forms are relevant to us. 
- match = re.search( - r"(.*)\s?/\s?(.*)", dbass_gene_name, re.IGNORECASE | re.DOTALL - ) - if match: - first_gene_name = match.group(1) - second_gene_name = match.group(2) - else: - match = re.search( - r"(.*)\s?\((.*)\)", dbass_gene_name, re.IGNORECASE | re.DOTALL - ) - if match: - first_gene_name = match.group(1) - second_gene_name = match.group(2) - else: - first_gene_name = dbass_gene_name - second_gene_name = None - - label = first_gene_name - synonym = second_gene_name - ensembl_type = "gene" - version = "1" - - xref_id = self.add_xref( - { - "accession": dbass_gene_id, - "version": version, - "label": label, - "source_id": source_id, - "species_id": species_id, - "info_type": "DIRECT", - }, - xref_dbi, - ) - - if synonym: - self.add_synonym(xref_id, synonym, xref_dbi) - - self.add_direct_xref(xref_id, ensembl_id, ensembl_type, "", xref_dbi) - - processed_count += 1 - else: - unmapped_count += 1 + if len(line) < self.EXPECTED_NUMBER_OF_COLUMNS: + raise ValueError(f"Line {csv_reader.line_num} of input file has an incorrect number of columns") - file_io.close() + dbass_gene_id, dbass_gene_name, dbass_full_name, ensembl_id = line[:4] - result_message = f"{processed_count} direct xrefs successfully processed\n" - result_message += f"Skipped {unmapped_count} unmapped xrefs" + if not ensembl_id.strip(): + unmapped_count += 1 + continue - return 0, result_message + first_gene_name, second_gene_name = self.extract_gene_names(dbass_gene_name) + xref_id = self.add_xref( + { + "accession": dbass_gene_id, + "version": "1", + "label": first_gene_name, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + self.add_direct_xref(xref_id, ensembl_id, "gene", "", xref_dbi) + + if second_gene_name: + self.add_synonym(xref_id, second_gene_name, xref_dbi) + + processed_count += 1 + + return processed_count, unmapped_count + + def extract_gene_names(self, dbass_gene_name: str) -> Tuple[Optional[str], Optional[str]]: + match = self.SLASH_PATTERN.search(dbass_gene_name) or self.PARENS_PATTERN.search(dbass_gene_name) + if match: + return match.groups() + return dbass_gene_name, None diff --git a/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py b/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py index 33a7328a2..699c633ba 100644 --- a/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py +++ b/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py @@ -14,70 +14,84 @@ """Parser module for EntrezGene and WikiGene sources.""" -from ensembl.production.xrefs.parsers.BaseParser import * - -EXPECTED_NUMBER_OF_COLUMNS = 16 +import csv +import logging +import re +from typing import Any, Dict, Tuple +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class EntrezGeneParser(BaseParser): + EXPECTED_NUMBER_OF_COLUMNS = 16 + SYNONYM_SPLITTER = re.compile(r"\|") + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) - - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") - - wiki_source_id = self.get_source_id_for_source_name("WikiGene", xref_dbi) - if verbose: - logging.info(f"Wiki source id = {wiki_source_id}") - - file_io = self.get_filehandle(file) - csv_reader = csv.reader(file_io, delimiter="\t") - - # Check if header is valid - header 
= next(csv_reader) - patterns = [ - r"\A[#]?\s*tax_id", - "geneid", - "symbol", - "locustag", - "synonyms", - "dbxrefs", - "chromosome", - "map_location", - "description", - "type_of_gene", - "symbol_from_nomenclature_authority", - "full_name_from_nomenclature_authority", - "nomenclature_status", - "other_designations", - "modification_date", - "feature_type", - ] - if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): - raise IOError(f"Malformed or unexpected header in EntrezGene file {file}") - - xref_count = 0 + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) + + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") + + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError("EntrezGene file is empty") + file_io.seek(0) + + csv_reader = csv.reader(file_io, delimiter="\t") + header = next(csv_reader) + patterns = [ + r"\A[#]?\s*tax_id$", + r"^geneid$", + r"^symbol$", + r"^locustag$", + r"^synonyms$", + r"^dbxrefs$", + r"^chromosome$", + r"^map_location$", + r"^description$", + r"^type_of_gene$", + r"^symbol_from_nomenclature_authority$", + r"^full_name_from_nomenclature_authority$", + r"^nomenclature_status$", + r"^other_designations$", + r"^modification_date$", + r"^feature_type$", + ] + if not self.is_file_header_valid(self.EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise ValueError(f"Malformed or unexpected header in EntrezGene file {xref_file}") + + wiki_source_id = self.get_source_id_for_source_name("WikiGene", xref_dbi) + if verbose: + logging.info(f"Wiki source id = {wiki_source_id}") + + processed_count, syn_count = self.process_lines(csv_reader, source_id, species_id, wiki_source_id, xref_dbi) + + result_message = f"{processed_count} EntrezGene Xrefs and {processed_count} WikiGene Xrefs added with {syn_count} synonyms" + return 0, result_message + + def process_lines(self, csv_reader: csv.reader, source_id: int, species_id: int, wiki_source_id: int, xref_dbi: Connection) -> Tuple[int, int]: + processed_count = 0 syn_count = 0 seen = {} - # Read lines for line in csv_reader: if not line: continue - tax_id = line[0] + if len(line) < self.EXPECTED_NUMBER_OF_COLUMNS: + raise ValueError(f"Line {csv_reader.line_num} of input file has an incorrect number of columns") + + tax_id = int(line[0]) acc = line[1] symbol = line[2] synonyms = line[4] desc = line[8] - if tax_id != species_id: - continue - if seen.get(acc): + if tax_id != species_id or acc in seen: continue xref_id = self.add_xref( @@ -103,18 +117,14 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: xref_dbi, ) - xref_count += 1 + processed_count += 1 - syns = re.split(r"\|", synonyms) - for synonym in syns: - if synonym != "-": + if synonyms.strip() != "-": + syns = self.SYNONYM_SPLITTER.split(synonyms) + for synonym in syns: self.add_synonym(xref_id, synonym, xref_dbi) syn_count += 1 - seen[acc] = 1 - - file_io.close() + seen[acc] = True - result_message = f"{xref_count} EntrezGene Xrefs and {xref_count} WikiGene Xrefs added with {syn_count} synonyms" - - return 0, result_message + return processed_count, syn_count diff --git a/src/python/ensembl/production/xrefs/parsers/HGNCParser.py b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py index 9bcda9cbd..b8bca4e45 100644 --- a/src/python/ensembl/production/xrefs/parsers/HGNCParser.py +++ 
b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py @@ -14,130 +14,157 @@ """Parser module for HGNC source.""" -from ensembl.production.xrefs.parsers.BaseParser import * -from unidecode import unidecode +from typing import Any, Dict, List, Tuple, Optional +import csv +import logging +import re +import requests import codecs +from sqlalchemy import select +from sqlalchemy.engine import Connection +from sqlalchemy.engine.url import URL +from unidecode import unidecode +from ensembl.core.models import ( + Transcript as TranscriptORM, + AttribType as AttribTypeORM, + TranscriptAttrib as TranscriptAttribORM, +) + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class HGNCParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - dba = args["dba"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + dba = args.get("dba") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") # Parse the file string and set default user - file_params = self.parse_file_string(file) - if not file_params.get("user"): - file_params["user"] = "ensro" - - # Prepare lookup lists - swissprot = self.get_valid_codes("Uniprot/SWISSPROT", species_id, xref_dbi) - refseq = self.get_valid_codes("refseq", species_id, xref_dbi) - source_list = ["refseq_peptide", "refseq_mRNA"] - entrezgene = self.get_valid_xrefs_for_dependencies( - "EntrezGene", source_list, xref_dbi - ) + file_params = self.parse_file_string(xref_file) + file_params.setdefault("user", "ensro") # Prepare sources self_source_name = self.get_source_name_for_source_id(source_id, xref_dbi) source_ids = { - "ccds": self.get_source_id_for_source_name( - self_source_name, xref_dbi, "ccds" - ), - "entrezgene_manual": self.get_source_id_for_source_name( - self_source_name, xref_dbi, "entrezgene_manual" - ), - "refseq_manual": self.get_source_id_for_source_name( - self_source_name, xref_dbi, "refseq_manual" - ), - "ensembl_manual": self.get_source_id_for_source_name( - self_source_name, xref_dbi, "ensembl_manual" - ), - "desc_only": self.get_source_id_for_source_name( - self_source_name, xref_dbi, "desc_only" - ), + "ccds": self.get_source_id_for_source_name(self_source_name, xref_dbi, "ccds"), + "entrezgene_manual": self.get_source_id_for_source_name(self_source_name, xref_dbi, "entrezgene_manual"), + "refseq_manual": self.get_source_id_for_source_name(self_source_name, xref_dbi, "refseq_manual"), + "ensembl_manual": self.get_source_id_for_source_name(self_source_name, xref_dbi, "ensembl_manual"), + "desc_only": self.get_source_id_for_source_name(self_source_name, xref_dbi, "desc_only"), "lrg": self.get_source_id_for_source_name("LRG_HGNC_notransfer", xref_dbi), "genecards": self.get_source_id_for_source_name("GeneCards", xref_dbi), } # Statistics counts - name_count = { - "ccds": 0, - "lrg": 0, - "ensembl_manual": 0, - "genecards": 0, - "refseq_manual": 0, - "entrezgene_manual": 0, - } - mismatch = 0 + name_count = {key: 0 for key in source_ids} # Connect to the ccds db - ccds_db_url = None - if dba: - ccds_db_url = dba - elif 
file_params.get("host"): - ccds_db_url = URL.create( - "mysql", - file_params["user"], - file_params["pass"], - file_params["host"], - file_params["port"], - file_params["dbname"], - ) - else: - raise AttributeError("No ensembl ccds database provided") - + ccds_db_url = dba or self.construct_db_url(file_params) if not ccds_db_url: raise AttributeError("No ensembl ccds database provided") - else: - if verbose: - logging.info(f"Found ccds DB: {ccds_db_url}") - - # Get CCDS data - db_engine = self.get_db_engine(ccds_db_url) - with db_engine.connect() as ccds_dbi: - query = ( - select(TranscriptAttribORM.value, TranscriptORM.stable_id) - .join( - TranscriptAttribORM, - TranscriptORM.transcript_id == TranscriptAttribORM.transcript_id, - ) - .join( - AttribTypeORM, - TranscriptAttribORM.attrib_type_id == AttribTypeORM.attrib_type_id, - ) - .where(AttribTypeORM.code == "ccds_transcript") - ) - result = ccds_dbi.execute(query).mappings().all() - - ccds_to_ens = {} - for row in result: - # Remove version - ccds_id = re.sub(r"\.\d+", "", row.value) - - ccds_to_ens[ccds_id] = row.stable_id + if verbose: + logging.info(f"Found ccds DB: {ccds_db_url}") # Get HGNC file (wget or disk) - mem_file = file - if file_params.get("wget"): - response = requests.get(file_params["wget"]) - if not response.ok: - raise IOError(response.reason) - mem_file = response.text + mem_file = self.fetch_file(file_params, xref_file) # Make sure the file is utf8 mem_file = codecs.encode(mem_file, "utf-8").decode("utf-8") mem_file = re.sub(r'"', '', mem_file) - file_io = self.get_filehandle(mem_file) - csv_reader = csv.DictReader(file_io, delimiter="\t") + with self.get_filehandle(mem_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"HGNC file is empty") + file_io.seek(0) + + csv_reader = csv.DictReader(file_io, delimiter="\t") + + syn_count = self.process_lines(csv_reader, source_ids, name_count, species_id, ccds_db_url, xref_dbi) + + result_message = "HGNC xrefs loaded:\n" + for count_type, count in name_count.items(): + if count_type == "desc_only": continue + result_message += f"\t{count_type}\t{count}\n" + result_message += f"{syn_count} synonyms added\n" + result_message += f"{name_count['desc_only']} HGNC ids could not be associated in xrefs" + + return 0, result_message + + def process_lines(self, csv_reader: csv.DictReader, source_ids: Dict[str, int], name_count: Dict[str, int], species_id: int, ccds_db_url: str, xref_dbi: Connection) -> int: + # Prepare lookup lists + refseq = self.get_valid_codes("refseq", species_id, xref_dbi) + source_list = ["refseq_peptide", "refseq_mRNA"] + entrezgene = self.get_valid_xrefs_for_dependencies("EntrezGene", source_list, xref_dbi) + + # Get CCDS data + ccds_to_ens = self.get_ccds_to_ens_mapping(ccds_db_url) + + synonym_count = 0 + + # Helper function to add direct xrefs and synonyms + def add_direct_xref_and_synonyms(source_key: str, accession: str, symbol: str, feature_id: str, name: str, previous_symbols: str, synonyms: str) -> Tuple[int, int]: + xref_id = self.add_xref( + { + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids[source_key], + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + self.add_direct_xref(xref_id, feature_id, "gene", "", xref_dbi) + name_count[source_key] += 1 + + count = self.add_synonyms_for_hgnc( + { + "source_id": source_ids[source_key], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + return xref_id, 
count + + # Helper function to add dependent xrefs and synonyms + def add_dependent_xref_and_synonyms(source_key: str, master_xrefs: List[int], accession: str, symbol: str, name: str, previous_symbols: str, synonyms: str) -> int: + for xref_id in master_xrefs: + self.add_dependent_xref( + { + "master_xref_id": xref_id, + "accession": accession, + "label": symbol, + "description": name, + "source_id": source_ids[source_key], + "species_id": species_id, + }, + xref_dbi, + ) + name_count[source_key] += 1 + + count = self.add_synonyms_for_hgnc( + { + "source_id": source_ids[source_key], + "name": accession, + "species_id": species_id, + "dead": previous_symbols, + "alias": synonyms, + }, + xref_dbi, + ) + + return count # Read lines for line in csv_reader: @@ -147,198 +174,53 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: previous_symbols = line["Previous symbols"] synonyms = line["Alias symbols"] - seen = 0 + seen = False # Direct CCDS to ENST mappings - ccds = line["CCDS IDs"] - ccds_list = [] - if ccds: - ccds_list = re.split(r",\s", ccds) - + ccds_list = re.split(r",\s", line["CCDS IDs"]) if line["CCDS IDs"] else [] for ccds in ccds_list: enst_id = ccds_to_ens.get(ccds) if not enst_id: continue - self.add_to_direct_xrefs( - { - "stable_id": enst_id, - "ensembl_type": "gene", - "accession": accession, - "label": symbol, - "description": name, - "source_id": source_ids["ccds"], - "species_id": species_id, - }, - xref_dbi, - ) - self.add_synonyms_for_hgnc( - { - "source_id": source_ids["ccds"], - "name": accession, - "species_id": species_id, - "dead": previous_symbols, - "alias": synonyms, - }, - xref_dbi, - ) - - name_count["ccds"] += 1 + direct_xref_id, syn_count = add_direct_xref_and_synonyms("ccds", accession, symbol, enst_id, name, previous_symbols, synonyms) + synonym_count += syn_count # Direct LRG to ENST mappings - lrg_id = line["Locus specific databases"] + lrg_id = self.extract_lrg_id(line["Locus specific databases"]) if lrg_id: - match = re.search(r"(LRG_\d+)\|", lrg_id) - if match: - lrg_id = match.group(1) - - self.add_to_direct_xrefs( - { - "stable_id": lrg_id, - "ensembl_type": "gene", - "accession": accession, - "label": symbol, - "description": name, - "source_id": source_ids["lrg"], - "species_id": species_id, - }, - xref_dbi, - ) - self.add_synonyms_for_hgnc( - { - "source_id": source_ids["lrg"], - "name": accession, - "species_id": species_id, - "dead": previous_symbols, - "alias": synonyms, - }, - xref_dbi, - ) - - name_count["lrg"] += 1 + direct_xref_id, syn_count = add_direct_xref_and_synonyms("lrg", accession, symbol, lrg_id, name, previous_symbols, synonyms) + synonym_count += syn_count # Direct Ensembl mappings ensg_id = line["Ensembl gene ID"] if ensg_id: - seen = 1 - - self.add_to_direct_xrefs( - { - "stable_id": ensg_id, - "ensembl_type": "gene", - "accession": accession, - "label": symbol, - "description": name, - "source_id": source_ids["ensembl_manual"], - "species_id": species_id, - }, - xref_dbi, - ) - self.add_synonyms_for_hgnc( - { - "source_id": source_ids["ensembl_manual"], - "name": accession, - "species_id": species_id, - "dead": previous_symbols, - "alias": synonyms, - }, - xref_dbi, - ) + seen = True - name_count["ensembl_manual"] += 1 + direct_xref_id, syn_count = add_direct_xref_and_synonyms("ensembl_manual", accession, symbol, ensg_id, name, previous_symbols, synonyms) + synonym_count += syn_count # GeneCards - direct_id = self.get_xref_id( - accession, source_ids["ensembl_manual"], species_id, xref_dbi - ) hgnc_id = 
re.search(r"HGNC:(\d+)", accession).group(1) - - self.add_dependent_xref( - { - "master_xref_id": direct_id, - "accession": hgnc_id, - "label": symbol, - "description": name, - "source_id": source_ids["genecards"], - "species_id": species_id, - }, - xref_dbi, - ) - self.add_synonyms_for_hgnc( - { - "source_id": source_ids["genecards"], - "name": hgnc_id, - "species_id": species_id, - "dead": previous_symbols, - "alias": synonyms, - }, - xref_dbi, - ) - - name_count["genecards"] += 1 + synonym_count += add_dependent_xref_and_synonyms("genecards", [direct_xref_id], hgnc_id, symbol, name, previous_symbols, synonyms) # RefSeq refseq_id = line["RefSeq IDs"] if refseq_id and refseq.get(refseq_id): - seen = 1 - - for xref_id in refseq[refseq_id]: - self.add_dependent_xref( - { - "master_xref_id": xref_id, - "accession": accession, - "label": symbol, - "description": name, - "source_id": source_ids["refseq_manual"], - "species_id": species_id, - }, - xref_dbi, - ) - name_count["refseq_manual"] += 1 - - self.add_synonyms_for_hgnc( - { - "source_id": source_ids["refseq_manual"], - "name": accession, - "species_id": species_id, - "dead": previous_symbols, - "alias": synonyms, - }, - xref_dbi, - ) + seen = True + + synonym_count += add_dependent_xref_and_synonyms("refseq_manual", refseq[refseq_id], accession, symbol, name, previous_symbols, synonyms) # EntrezGene entrez_id = line["NCBI Gene ID"] if entrez_id and entrezgene.get(entrez_id): - seen = 1 - - self.add_dependent_xref( - { - "master_xref_id": entrezgene[entrez_id], - "accession": accession, - "label": symbol, - "description": name, - "source_id": source_ids["entrezgene_manual"], - "species_id": species_id, - }, - xref_dbi, - ) - self.add_synonyms_for_hgnc( - { - "source_id": source_ids["entrezgene_manual"], - "name": accession, - "species_id": species_id, - "dead": previous_symbols, - "alias": synonyms, - }, - xref_dbi, - ) + seen = True - name_count["entrezgene_manual"] += 1 + synonym_count += add_dependent_xref_and_synonyms("entrezgene_manual", [entrezgene[entrez_id]], accession, symbol, name, previous_symbols, synonyms) # Store to keep descriptions if not stored yet if not seen: - xref_id = self.add_xref( + self.add_xref( { "accession": accession, "label": symbol, @@ -349,7 +231,9 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: }, xref_dbi, ) - self.add_synonyms_for_hgnc( + name_count["desc_only"] += 1 + + synonym_count += self.add_synonyms_for_hgnc( { "source_id": source_ids["desc_only"], "name": accession, @@ -359,24 +243,18 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: }, xref_dbi, ) - mismatch += 1 - file_io.close() + return synonym_count - result_message = "HGNC xrefs loaded:\n" - for count_type, count in name_count.items(): - result_message += f"\t{count_type}\t{count}\n" - result_message += f"{mismatch} HGNC ids could not be associated in xrefs" - - return 0, result_message - - def add_synonyms_for_hgnc(self, args: Dict[str, Any], dbi: Connection) -> None: - source_id = args["source_id"] - name = args["name"] - species_id = args["species_id"] - dead_string = args.get("dead") + def add_synonyms_for_hgnc(self, args: Dict[str, Any], dbi: Connection) -> int: + source_id = args["source_id"] + name = args["name"] + species_id = args["species_id"] + dead_string = args.get("dead") alias_string = args.get("alias") + syn_count = 0 + # Dead name, add to synonym if dead_string: dead_string = re.sub('"', "", dead_string) @@ -388,8 +266,8 @@ def add_synonyms_for_hgnc(self, args: Dict[str, Any], dbi: Connection) -> None: 
except: pass dead = unidecode(dead.upper()) - self.add_to_syn(name, source_id, dead, species_id, dbi) + syn_count += 1 # Alias name, add to synonym if alias_string: @@ -402,20 +280,72 @@ def add_synonyms_for_hgnc(self, args: Dict[str, Any], dbi: Connection) -> None: except: pass alias = unidecode(alias.upper()) - self.add_to_syn(name, source_id, alias, species_id, dbi) + syn_count += 1 + + return syn_count def parse_file_string(self, file_string: str) -> Dict[str, str]: - # file_string = re.sub(r"\A\w+:", "", file_string) file_string = re.sub(r"^\w+:", "", file_string) - param_pairs = file_string.split(",") params = {} # Set provided values for pair in param_pairs: - if re.search("=>", pair): + if "=>" in pair: key, value = pair.split("=>") params[key] = value return params + + def construct_db_url(self, file_params: Dict[str, str]) -> Optional[URL]: + if file_params.get("host"): + return URL.create( + "mysql", + file_params["user"], + file_params["pass"], + file_params["host"], + file_params["port"], + file_params["dbname"], + ) + return None + + def get_ccds_to_ens_mapping(self, ccds_url: str) -> Dict[str, str]: + db_engine = self.get_db_engine(ccds_url) + with db_engine.connect() as ccds_dbi: + query = ( + select(TranscriptAttribORM.value, TranscriptORM.stable_id) + .join( + TranscriptAttribORM, + TranscriptORM.transcript_id == TranscriptAttribORM.transcript_id, + ) + .join( + AttribTypeORM, + TranscriptAttribORM.attrib_type_id == AttribTypeORM.attrib_type_id, + ) + .where(AttribTypeORM.code == "ccds_transcript") + ) + result = ccds_dbi.execute(query).mappings().all() + + ccds_to_ens = {} + for row in result: + ccds_id = re.sub(r"\.\d+", "", row.value) # Remove version + + ccds_to_ens[ccds_id] = row.stable_id + + return ccds_to_ens + + def fetch_file(self, file_params: Dict[str, str], file: str) -> str: + if file_params.get("wget"): + response = requests.get(file_params["wget"]) + if not response.ok: + raise IOError(response.reason) + return response.text + return file + + def extract_lrg_id(self, lrg_id: str) -> Optional[str]: + if lrg_id: + match = re.search(r"(LRG_\d+)\|", lrg_id) + if match: + return match.group(1) + return None diff --git a/src/python/ensembl/production/xrefs/parsers/HPAParser.py b/src/python/ensembl/production/xrefs/parsers/HPAParser.py index 76c99d769..f1268f047 100644 --- a/src/python/ensembl/production/xrefs/parsers/HPAParser.py +++ b/src/python/ensembl/production/xrefs/parsers/HPAParser.py @@ -15,60 +15,64 @@ """Parser module for HPA source.""" import csv +from typing import Dict, Any, Tuple +from sqlalchemy.engine import Connection from ensembl.production.xrefs.parsers.BaseParser import BaseParser -EXPECTED_NUMBER_OF_COLUMNS = 4 - - class HPAParser(BaseParser): + EXPECTED_NUMBER_OF_COLUMNS = 4 + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") + + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError("HPA file is empty") + file_io.seek(0) - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + csv_reader = csv.reader(file_io, delimiter=",", 
strict=True) + header = next(csv_reader) + patterns = [r"^antibody$", r"^antibody_id$", r"^ensembl_peptide_id$", r"^link$"] + if not self.is_file_header_valid(self.EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise ValueError(f"Malformed or unexpected header in HPA file {xref_file}") - file_io = self.get_filehandle(file) - csv_reader = csv.reader(file_io, delimiter=",", strict=True) + parsed_count = self.process_lines(csv_reader, source_id, species_id, xref_dbi) - # Check if header is valid - header = next(csv_reader) - patterns = ["antibody", "antibody_id", "ensembl_peptide_id", "link"] - if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): - raise IOError(f"Malformed or unexpected header in HPA file {file}") + result_message = f"{parsed_count} direct xrefs successfully parsed" + return 0, result_message + def process_lines(self, csv_reader: csv.reader, source_id: int, species_id: int, xref_dbi: Connection) -> int: parsed_count = 0 - # Read lines for line in csv_reader: if not line: continue - antibody_name = line[0] - antibody_id = line[1] - ensembl_id = line[2] + if len(line) < self.EXPECTED_NUMBER_OF_COLUMNS: + raise ValueError(f"Line {csv_reader.line_num} of input file has an incorrect number of columns") + + antibody_name, antibody_id, ensembl_id = line[:3] - self.add_to_direct_xrefs( + xref_id = self.add_xref( { "accession": antibody_id, "version": "1", "label": antibody_name, - "stable_id": ensembl_id, - "ensembl_type": "translation", "source_id": source_id, "species_id": species_id, "info_type": "DIRECT", }, xref_dbi, ) + self.add_direct_xref(xref_id, ensembl_id, "translation", "", xref_dbi) parsed_count += 1 - file_io.close() - - result_message = f"{parsed_count} direct xrefs succesfully parsed" - - return 0, result_message + return parsed_count diff --git a/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py b/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py index 8ce883d1d..94dc7466a 100644 --- a/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py +++ b/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py @@ -14,47 +14,54 @@ """Parser module for JGI source.""" -from ensembl.production.xrefs.parsers.BaseParser import * - +import re from Bio import SeqIO +from typing import Dict, Any, Tuple +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class JGI_ProteinParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") xrefs = [] - file_io = self.get_filehandle(file) - fasta_sequences = SeqIO.parse(file_io, "fasta") + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"JGIProtein file is empty") + file_io.seek(0) - for fasta in fasta_sequences: - accession = fasta.id - sequence = fasta.seq + fasta_sequences = SeqIO.parse(file_io, "fasta") - # Extract accession value - accession = re.search(r"^ci0100(\w+?)$", accession).group(1) + for fasta in fasta_sequences: + accession = fasta.id + sequence = str(fasta.seq) 
- # Build an xref object and store it - xref = { - "ACCESSION": accession, - "SEQUENCE": sequence, - "SOURCE_ID": source_id, - "SPECIES_ID": species_id, - "SEQUENCE_TYPE": "peptide", - } - xrefs.append(xref) + # Extract accession value + match = re.search(r"^ci0100(\w+?)$", accession) + if not match: + continue + accession = match.group(1) - file_io.close() + # Build an xref object and store it + xref = { + "ACCESSION": accession, + "SEQUENCE": sequence, + "SOURCE_ID": source_id, + "SPECIES_ID": species_id, + "SEQUENCE_TYPE": "peptide", + "INFO_TYPE": "SEQUENCE_MATCH", + } + xrefs.append(xref) self.upload_xref_object_graphs(xrefs, xref_dbi) - result_message = "%d JGI_ xrefs succesfully parsed" % len(xrefs) + result_message = f"{len(xrefs)} JGI_ xrefs successfully parsed" return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/MGIDescParser.py b/src/python/ensembl/production/xrefs/parsers/MGIDescParser.py new file mode 100644 index 000000000..a348c7227 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/MGIDescParser.py @@ -0,0 +1,107 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for MGI Descriptions.""" + +import csv +import logging +import re +from typing import Any, Dict, Tuple +from sqlalchemy.engine import Connection + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser + +class MGIDescParser(BaseParser): + EXPECTED_NUMBER_OF_COLUMNS = 12 + SYNONYM_SPLITTER = re.compile(r"[|]") + + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) + + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") + + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError("MGI_desc file is empty") + file_io.seek(0) + + csv_reader = csv.reader(file_io, delimiter="\t", strict=True, quotechar=None, escapechar=None) + header = next(csv_reader) + patterns = [ + r"^mgi accession id$", + r"^chr$", + r"^cm position$", + r"^genome coordinate start$", + r"^genome coordinate end$", + r"^strand$", + r"^marker symbol$", + r"^status$", + r"^marker name$", + r"^marker type$", + r"^feature type$", + r"^marker synonyms \(pipe-separated\)$", + ] + if not self.is_file_header_valid(self.EXPECTED_NUMBER_OF_COLUMNS, patterns, header): + raise ValueError(f"Malformed or unexpected header in MGI_desc file {xref_file}") + + xref_count, syn_count = self.process_lines(csv_reader, source_id, species_id, xref_dbi, verbose) + + result_message = f"{xref_count} MGI Description Xrefs added\n{syn_count} synonyms added" + return 0, result_message + + def process_lines(self, csv_reader: csv.reader, source_id: int, species_id: int, xref_dbi: Connection, verbose: bool) 
-> Tuple[int, int]: + xref_count = 0 + syn_count = 0 + + for line in csv_reader: + if not line: + continue + + if len(line) < self.EXPECTED_NUMBER_OF_COLUMNS: + raise ValueError(f"Line {csv_reader.line_num} of input file has an incorrect number of columns") + + accession = line[0] + label = line[6] + marker = line[8] + synonym_field = line[11] + + xref_id = self.add_xref( + { + "accession": accession, + "label": label, + "description": marker, + "source_id": source_id, + "species_id": species_id, + "info_type": "MISC", + }, + xref_dbi, + ) + + if not marker and verbose: + logging.info(f"{accession} has no description") + + xref_count += 1 + + if synonym_field: + synonyms = self.SYNONYM_SPLITTER.split(synonym_field) + for synonym in synonyms: + self.add_synonym(xref_id, synonym, xref_dbi) + syn_count += 1 + + return xref_count, syn_count diff --git a/src/python/ensembl/production/xrefs/parsers/MGIParser.py b/src/python/ensembl/production/xrefs/parsers/MGIParser.py index 2508d516a..4d95de9e8 100644 --- a/src/python/ensembl/production/xrefs/parsers/MGIParser.py +++ b/src/python/ensembl/production/xrefs/parsers/MGIParser.py @@ -14,59 +14,68 @@ """Parser module for MGI source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import csv +from typing import Dict, Any, Tuple, List +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class MGIParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") - syn_hash = self.get_ext_synonyms("MGI", xref_dbi) + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError("MGI file is empty") + file_io.seek(0) - file_io = self.get_filehandle(file) - csv_reader = csv.reader(file_io, delimiter="\t", strict=True) + csv_reader = csv.reader(file_io, delimiter="\t", strict=True) + syn_hash = self.get_ext_synonyms("MGI", xref_dbi) + count, syn_count = self.process_lines(csv_reader, source_id, species_id, xref_dbi, syn_hash) + + result_message = f"{count} direct MGI xrefs added\n{syn_count} synonyms added" + return 0, result_message + + def process_lines(self, csv_reader: csv.reader, source_id: int, species_id: int, xref_dbi: Connection, syn_hash: Dict[str, List[str]]) -> Tuple[int, int]: count = 0 syn_count = 0 - # Read lines for line in csv_reader: if not line: continue accession = line[0] + label = line[1] + description = line[2] ensembl_id = line[5] xref_id = self.add_xref( { "accession": accession, "version": 0, - "label": line[1], - "description": line[2], + "label": label, + "description": description, "source_id": source_id, "species_id": species_id, "info_type": "DIRECT", }, xref_dbi, ) - self.add_direct_xref(xref_id, ensembl_id, "Gene", "", xref_dbi) + self.add_direct_xref(xref_id, ensembl_id, "gene", "", xref_dbi) - if syn_hash.get(accession): - for synonym in syn_hash[accession]: + synonyms = syn_hash.get(accession) + if synonyms: + for synonym in synonyms: self.add_synonym(xref_id, synonym, xref_dbi) syn_count += 1 
count += 1 - file_io.close() - - result_message = f"{count} direct MGI xrefs added\n" - result_message += f"{syn_count} synonyms added" - - return 0, result_message + return count, syn_count diff --git a/src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py b/src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py deleted file mode 100644 index ae1fbb3dd..000000000 --- a/src/python/ensembl/production/xrefs/parsers/MGI_CCDS_Parser.py +++ /dev/null @@ -1,107 +0,0 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Parser module for MGI CCDS source.""" - -from ensembl.production.xrefs.parsers.BaseParser import * - - -class MGI_CCDS_Parser(BaseParser): - def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] - - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") - - source_ids = [] - labels = {} - versions = {} - descriptions = {} - accessions = {} - - query = select(SourceUORM.source_id).filter(SourceUORM.name.like("MGI")) - result = xref_dbi.execute(query).fetchall() - for row in result: - source_ids.append(row[0]) - - query = select( - XrefUORM.accession, XrefUORM.label, XrefUORM.version, XrefUORM.description - ).filter(XrefUORM.source_id.in_(source_ids)) - - for row in xref_dbi.execute(query).mappings().all(): - if row["description"]: - accessions[row["label"]] = row.accession - labels[row["accession"]] = row.label - versions[row["accession"]] = row.version - descriptions[row["accession"]] = row.description - - # Get master xref ids via the ccds label - ccds_label_to_xref_id = {} - query = select(XrefUORM.label, XrefUORM.xref_id).where( - XrefUORM.source_id == SourceUORM.source_id, SourceUORM.name == "CCDS" - ) - result = xref_dbi.execute(query).fetchall() - for row in result: - ccds_label_to_xref_id[row[0]] = row[1] - - count = 0 - ccds_missing = 0 - mgi_missing = 0 - - mgi_io = self.get_filehandle(file) - for line in mgi_io: - line = line.rstrip() - if not line: - continue - - fields = line.split("\t") - chromosome = fields[0] - g_accession = fields[1] - gene_name = fields[2] - entrez_id = fields[3] - ccds = fields[4] - - if ccds_label_to_xref_id.get(ccds): - if accessions.get(gene_name) and labels.get(accessions[gene_name]): - accession = accessions[gene_name] - self.add_dependent_xref( - { - "master_xref_id": ccds_label_to_xref_id[ccds], - "accession": accession, - "version": versions[accession], - "label": labels[accession], - "description": descriptions[accession], - "source_id": source_id, - "species_id": species_id, - }, - xref_dbi, - ) - - count += 1 - else: - mgi_missing += 1 - else: - ccds_missing += 1 - - mgi_io.close() - - result_message = f"Added {count} MGI xrefs via CCDS\n" - result_message += ( - f"{ccds_missing} CCDS not resolved, {mgi_missing} MGI not 
found" - ) - - return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py b/src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py deleted file mode 100644 index 010298200..000000000 --- a/src/python/ensembl/production/xrefs/parsers/MGI_Desc_Parser.py +++ /dev/null @@ -1,101 +0,0 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Parser module for MGI Descriptions.""" - -from ensembl.production.xrefs.parsers.BaseParser import * - -EXPECTED_NUMBER_OF_COLUMNS = 12 - - -class MGI_Desc_Parser(BaseParser): - def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) - - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") - - file_io = self.get_filehandle(file) - csv_reader = csv.reader( - file_io, delimiter="\t", strict=True, quotechar=None, escapechar=None - ) - - # Check if header is valid - header = next(csv_reader) - patterns = [ - "mgi accession id", - "chr", - "cm position", - "genome coordinate start", - "genome coordinate end", - "strand", - "marker symbol", - "status", - "marker name", - "marker type", - "feature type", - r"marker\ssynonyms\s\(pipe\-separated\)", - ] - if not self.is_file_header_valid(EXPECTED_NUMBER_OF_COLUMNS, patterns, header): - raise IOError(f"Malformed or unexpected header in MGI_desc file {file}") - - xref_count = 0 - syn_count = 0 - acc_to_xref = {} - - # Read lines - for line in csv_reader: - if not line: - continue - - accession = line[0] - marker = line[8] - - xref_id = self.add_xref( - { - "accession": accession, - "label": line[6], - "description": marker, - "source_id": source_id, - "species_id": species_id, - "info_type": "MISC", - }, - xref_dbi, - ) - acc_to_xref[accession] = xref_id - - if not marker and verbose: - logging.info(f"{accession} has no description") - - xref_count += 1 - - if acc_to_xref.get(accession): - synonym_field = line[11] - if synonym_field: - synonyms = re.split(r"[|]", synonym_field) - - for synonym in synonyms: - self.add_synonym(xref_id, synonym, xref_dbi) - syn_count += 1 - - file_io.close() - - result_message = f"{xref_count} MGI Description Xrefs added\n" - result_message += f"{syn_count} synonyms added" - - return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/MIMParser.py b/src/python/ensembl/production/xrefs/parsers/MIMParser.py index 1ae4f5952..829c5de21 100644 --- a/src/python/ensembl/production/xrefs/parsers/MIMParser.py +++ b/src/python/ensembl/production/xrefs/parsers/MIMParser.py @@ -14,29 +14,30 @@ """Parser module for MIM source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import re +import logging +from typing import Any, Dict, List, Optional, Tuple +from sqlalchemy.engine import 
Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class MIMParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - general_source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) - - if not general_source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") - + general_source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) + + if not general_source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") + old_to_new, removed = {}, {} - sources = [] - - sources.append(general_source_id) + sources = [general_source_id] gene_source_id = self.get_source_id_for_source_name("MIM_GENE", xref_dbi) - sources.append(gene_source_id) morbid_source_id = self.get_source_id_for_source_name("MIM_MORBID", xref_dbi) - sources.append(morbid_source_id) + sources.extend([gene_source_id, morbid_source_id]) TYPE_SINGLE_SOURCES = { "*": gene_source_id, @@ -48,30 +49,24 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: counters = {gene_source_id: 0, morbid_source_id: 0, "removed": 0, "synonyms": 0} if verbose: - logging.info("Sources are: " + ", ".join(map(str, sources))) - - for section in self.get_file_sections(file, "*RECORD*"): - if len(section) == 1: - continue + logging.info(f"Sources are: {', '.join(map(str, sources))}") + for section in self.get_file_sections(xref_file, "*RECORD*"): record = "".join(section) - # Extract the TI field + # Extract the TI field from the record ti = self.extract_ti(record) if not ti: - raise IOError("Failed to extract TI field from record") + raise ValueError("Failed to extract TI field from record") - # Extract record type - (record_type, number, long_desc) = self.parse_ti(ti) + # Extract record type, number, and description from the TI field + record_type, number, long_desc = self.parse_ti(ti) if record_type is None: - raise IOError( - "Failed to extract record type and description from TI field" - ) + raise ValueError("Failed to extract record type and description from TI field") # Use the first block of text as description fields = re.split(";;", long_desc, flags=re.MULTILINE | re.DOTALL) - label = fields[0] - label = f"{label} [{record_type}{number}]" + label = f"{fields[0]} [{record_type}{number}]" xref_object = { "accession": number, @@ -81,79 +76,72 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: "info_type": "UNMAPPED", } - if TYPE_SINGLE_SOURCES.get(record_type): + if record_type in TYPE_SINGLE_SOURCES: type_source = TYPE_SINGLE_SOURCES[record_type] xref_object["source_id"] = type_source counters[type_source] += 1 - - xref_id = self.add_xref(xref_object, xref_dbi) + self.add_xref(xref_object, xref_dbi) elif record_type == "+": # This type means both gene and phenotype, add both xref_object["source_id"] = gene_source_id counters[gene_source_id] += 1 - xref_id = self.add_xref(xref_object, xref_dbi) + self.add_xref(xref_object, xref_dbi) xref_object["source_id"] = morbid_source_id counters[morbid_source_id] += 1 - xref_id = self.add_xref(xref_object, xref_dbi) + self.add_xref(xref_object, xref_dbi) elif record_type == "^": - match = re.search( - r"MOVED\sTO\s(\d+)", long_desc, flags=re.MULTILINE | re.DOTALL - ) - if match: - new_number = match.group(1) - if 
new_number != number: - old_to_new[number] = new_number - elif long_desc == "REMOVED FROM DATABASE": - removed[number] = 1 - counters["removed"] += 1 - else: - raise IOError(f"Unsupported type of a '^' record: '{long_desc}'") - - # Generate synonyms from "MOVED TO" entries - for old, new in old_to_new.items(): - # Some entries in the MIM database have been moved multiple times - # Keep traversing the chain of renames until we have reached the end - while old_to_new.get(new): - new = old_to_new[new] + self.handle_moved_or_removed_record(number, long_desc, old_to_new, removed, counters) - # Check if the entry has been removed from the database - if not removed.get(new): - self.add_to_syn_for_mult_sources( - new, sources, old, species_id, xref_dbi - ) - counters["synonyms"] += 1 + self.generate_synonyms_from_moved_entries(old_to_new, removed, sources, species_id, xref_dbi, counters) - result_message = "%d genemap and %d phenotype MIM xrefs added\n" % ( - counters[gene_source_id], - counters[morbid_source_id], + result_message = ( + f"{counters[gene_source_id]} genemap and {counters[morbid_source_id]} phenotype MIM xrefs added\n" + f"\t{counters['synonyms']} synonyms (defined by MOVED TO) added\n" + f"\t{counters['removed']} entries removed" ) - result_message += ( - "\t%d synonyms (defined by MOVED TO) added\n" % counters["synonyms"] - ) - result_message += "\t%d entries removed" % counters["removed"] return 0, result_message - def extract_ti(self, input_record: str) -> str: - ti = None - + def extract_ti(self, input_record: str) -> Optional[str]: + """Extract the TI field from the input record.""" match = re.search( r"[*]FIELD[*]\sTI\n(.+?)\n?(?:[*]FIELD[*]| [*]RECORD[*]| [*]THEEND[*])", input_record, flags=re.MULTILINE | re.DOTALL, ) - if match: - ti = match.group(1) - - return ti + return match.group(1) if match else None def parse_ti(self, ti: str) -> Tuple[Optional[str], Optional[str], Optional[str]]: + """Parse the TI field to extract record type, number, and description.""" ti = re.sub(r"(?:;;\n|\n;;)", ";;", ti, flags=re.MULTILINE | re.DOTALL) ti = re.sub(r"\n", "", ti, flags=re.MULTILINE | re.DOTALL) match = re.search(r"\A([#%+*^]*)(\d+)\s+(.+)", ti) + return match.groups() if match else (None, None, None) + + def handle_moved_or_removed_record(self, number: str, long_desc: str, old_to_new: Dict[str, str], removed: Dict[str, int], counters: Dict[str, int]) -> None: + """Handle records that have been moved or removed.""" + match = re.search(r"MOVED\sTO\s(\d+)", long_desc, flags=re.MULTILINE | re.DOTALL) if match: - return match.group(1), match.group(2), match.group(3) + new_number = match.group(1) + if new_number != number: + old_to_new[number] = new_number + elif long_desc == "REMOVED FROM DATABASE": + removed[number] = 1 + counters["removed"] += 1 + else: + raise IOError(f"Unsupported type of a '^' record: '{long_desc}'") + + def generate_synonyms_from_moved_entries(self, old_to_new: Dict[str, str], removed: Dict[str, int], sources: List[int], species_id: int, xref_dbi: Connection, counters: Dict[str, int]) -> None: + """Generate synonyms from 'MOVED TO' entries.""" + for old, new in old_to_new.items(): + # Some entries in the MIM database have been moved multiple times + # Keep traversing the chain of renames until we have reached the end + while old_to_new.get(new): + new = old_to_new[new] - return None, None, None + # Check if the entry has been removed from the database + if not removed.get(new): + self.add_to_syn_for_mult_sources(new, sources, old, species_id, xref_dbi) + 
counters["synonyms"] += 1 diff --git a/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py b/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py index 6c7688889..4a1654fc5 100644 --- a/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py +++ b/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py @@ -14,27 +14,57 @@ """Parser module for MIM to Gene source.""" -from ensembl.production.xrefs.parsers.BaseParser import * - -EXPECTED_NUMBER_OF_COLUMNS = 6 +import csv +import re +import logging +from typing import Any, Dict, Tuple +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class Mim2GeneParser(BaseParser): + EXPECTED_NUMBER_OF_COLUMNS = 6 + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - general_source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) + general_source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) + + if not general_source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") + + counters = { + "all_entries": 0, + "dependent_on_entrez": 0, + "missed_master": 0, + "missed_omim": 0, + } + + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"Mim2Gene file is empty") + file_io.seek(0) + + csv_reader = csv.reader(file_io, delimiter="\t") - if not general_source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + self.process_lines(csv_reader, xref_file, species_id, counters, verbose, xref_dbi) + result_message = ( + f"Processed {counters['all_entries']} entries. Out of those\n" + f"\t{counters['missed_omim']} had missing OMIM entries,\n" + f"\t{counters['dependent_on_entrez']} were dependent EntrezGene xrefs,\n" + f"\t{counters['missed_master']} had missing master entries." 
+ ) + # result_message = f"all={counters['all_entries']} -- missed_omim={counters['missed_omim']} -- dependent_on_entrez={counters['dependent_on_entrez']} -- missed_master={counters['missed_master']}" + + return 0, result_message + + def process_lines(self, csv_reader: csv.reader, xref_file:str, species_id: int, counters: Dict[str, int], verbose: bool, xref_dbi: Connection) -> None: # Get needed source IDs mim_gene_source_id = self.get_source_id_for_source_name("MIM_GENE", xref_dbi) - mim_morbid_source_id = self.get_source_id_for_source_name( - "MIM_MORBID", xref_dbi - ) + mim_morbid_source_id = self.get_source_id_for_source_name("MIM_MORBID", xref_dbi) entrez_source_id = self.get_source_id_for_source_name("EntrezGene", xref_dbi) # This will be used to prevent insertion of duplicates @@ -45,16 +75,6 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: mim_morbid = self.get_valid_codes("MIM_MORBID", species_id, xref_dbi) entrez = self.get_valid_codes("EntrezGene", species_id, xref_dbi) - counters = { - "all_entries": 0, - "dependent_on_entrez": 0, - "missed_master": 0, - "missed_omim": 0, - } - - file_io = self.get_filehandle(file) - csv_reader = csv.reader(file_io, delimiter="\t") - # Read lines for line in csv_reader: if not line: @@ -67,34 +87,21 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: if is_comment: patterns = [ r"\A[#]?\s*MIM[ ]number", - "GeneID", - "type", - "Source", - "MedGenCUI", - "Comment", + r"^GeneID$", + r"^type$", + r"^Source$", + r"^MedGenCUI$", + r"^Comment$", ] - if len( - line - ) == EXPECTED_NUMBER_OF_COLUMNS and not self.is_file_header_valid( - EXPECTED_NUMBER_OF_COLUMNS, patterns, line, True - ): - raise IOError( - f"Malformed or unexpected header in Mim2Gene file {file}" - ) + if not self.is_file_header_valid(self.EXPECTED_NUMBER_OF_COLUMNS, patterns, line, True): + raise ValueError(f"Malformed or unexpected header in Mim2Gene file {xref_file}") continue - if len(line) != EXPECTED_NUMBER_OF_COLUMNS: - raise IOError( - f"Line {csv_reader.line_num} of input file {file} has an incorrect number of columns" - ) + if len(line) != self.EXPECTED_NUMBER_OF_COLUMNS: + raise ValueError(f"Line {csv_reader.line_num} of input file has an incorrect number of columns") fields = [re.sub(r"\s+\Z", "", x) for x in line] - omim_acc = fields[0] - entrez_id = fields[1] - type = fields[2] - source = fields[3] - medgen = fields[4] - comment = fields[5] + omim_acc, entrez_id, type = fields[:3] counters["all_entries"] += 1 @@ -109,15 +116,8 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: continue # Check if type is known - if verbose and type not in [ - "gene", - "gene/phenotype", - "predominantly phenotypes", - "phenotype", - ]: - logging.warn( - f"Unknown type {type} for MIM Number {omim_acc} ({file}:{csv_reader.line_num})" - ) + if verbose and type not in ["gene", "gene/phenotype", "predominantly phenotypes", "phenotype"]: + logging.warning(f"Unknown type {type} for MIM Number {omim_acc} ({xref_file}:{csv_reader.line_num})") # With all the checks taken care of, insert the mappings. We check # both MIM_GENE and MIM_MORBID every time because some MIM entries @@ -145,26 +145,11 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: xref_dbi, ) - file_io.close() - - result_message = ( - "Processed %d entries. 
Out of those\n" % counters["all_entries"] - ) - result_message += "\t%d had missing OMIM entries,\n" % counters["missed_omim"] - result_message += ( - "\t%d were dependent EntrezGene xrefs,\n" % counters["dependent_on_entrez"] - ) - result_message += "\t%d had missing master entries." % counters["missed_master"] - - return 0, result_message - def process_xref_entry(self, args: Dict[str, Any], dbi: Connection) -> int: count = 0 - for ent_id in args["entrez_xrefs"]: self.add_dependent_xref_maponly( args["mim_xref_id"], args["mim_source_id"], ent_id, None, dbi, True ) count += 1 - return count diff --git a/src/python/ensembl/production/xrefs/parsers/RFAMParser.py b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py index c7d4990eb..e760cbf9e 100644 --- a/src/python/ensembl/production/xrefs/parsers/RFAMParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py @@ -14,33 +14,50 @@ """Parser module for RFAM source.""" -from ensembl.production.xrefs.parsers.BaseParser import * - +import logging +import os +import re +import wget +from typing import Any, Dict, List, Optional, Tuple +from urllib.parse import urlparse +from sqlalchemy import and_, select +from sqlalchemy.engine import Connection +from sqlalchemy.engine.url import URL + +from ensembl.core.models import ( + Analysis as AnalysisORM, + Transcript as TranscriptORM, + ExonTranscript as ExonTranscriptORM, + SupportingFeature as SupportingFeatureORM, + DnaAlignFeature as DnaAlignFeatureORM, +) + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class RFAMParser(BaseParser): + ACCESSION_PATTERN = re.compile(r"^#=GF\sAC\s+(\w+)", re.MULTILINE) + LABEL_PATTERN = re.compile(r"^#=GF\sID\s+([^\n]+)", re.MULTILINE) + DESCRIPTION_PATTERN = re.compile(r"^#=GF\sDE\s+([^\n]+)", re.MULTILINE) + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - species_name = args["species_name"] - file = args["file"] - dba = args["dba"] - ensembl_release = args["ensembl_release"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) - - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + source_id = args.get("source_id") + species_id = args.get("species_id") + species_name = args.get("species_name") + xref_file = args.get("file") + dba = args.get("dba") + ensembl_release = args.get("ensembl_release") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) + + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") # Extract db connection parameters from file - wget_url, db_user, db_host, db_port, db_name, db_pass = ( - self.extract_params_from_string( - file, ["wget", "user", "host", "port", "dbname", "pass"] - ) + wget_url, db_user, db_host, db_port, db_name, db_pass = self.extract_params_from_string( + xref_file, ["wget", "user", "host", "port", "dbname", "pass"] ) - if not db_user: - db_user = "ensro" - if not db_port: - db_port = "3306" + db_user = db_user or "ensro" + db_port = db_port or "3306" # Get the species name(s) species_id_to_names = self.species_id_to_names(xref_dbi) @@ -53,27 +70,34 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: species_name = species_id_to_names[species_id][0] # Connect to the appropriate rfam db + rfam_db_url = self.get_rfam_db_url(db_host, db_user, db_pass, db_port, db_name, dba, species_name, ensembl_release, verbose) + 
if not rfam_db_url: + raise AttributeError("Could not find RFAM DB.") + if verbose: + logging.info(f"Found RFAM DB: {rfam_db_url}") + + # Download file through wget if url present + if wget_url: + xref_file = self.download_file(wget_url, xref_file) + + # Add xrefs + xref_count, direct_count = self.process_lines(xref_file, rfam_db_url, source_id, species_id, xref_dbi) + + result_message = f"Added {xref_count} RFAM xrefs and {direct_count} direct xrefs" + return 0, result_message + + def get_rfam_db_url(self, db_host: str, db_user: str, db_pass: str, db_port: str, db_name: str, dba: str, species_name: str, ensembl_release: str, verbose: bool) -> Any: if db_host: - rfam_db_url = URL.create( - "mysql", db_user, db_pass, db_host, db_port, db_name - ) + return URL.create("mysql", db_user, db_pass, db_host, db_port, db_name) elif dba: - rfam_db_url = dba + return dba else: if verbose: logging.info("Looking for db in mysql-ens-sta-1") registry = "ensro@mysql-ens-sta-1:4519" - rfam_db_url = self.get_db_from_registry( - species_name, "core", ensembl_release, registry - ) + return self.get_db_from_registry(species_name, "core", ensembl_release, registry) - if not rfam_db_url: - raise IOError(f"Could not find RFAM DB.") - else: - if verbose: - logging.info(f"Found RFAM DB: {rfam_db_url}") - - # Get data from rfam db + def get_rfam_transcript_stable_ids(self, rfam_db_url: Any) -> Dict[str, List[str]]: db_engine = self.get_db_engine(rfam_db_url) with db_engine.connect() as rfam_dbi: query = ( @@ -103,8 +127,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: ) .join( DnaAlignFeatureORM, - DnaAlignFeatureORM.dna_align_feature_id - == SupportingFeatureORM.feature_id, + DnaAlignFeatureORM.dna_align_feature_id == SupportingFeatureORM.feature_id, ) .order_by(DnaAlignFeatureORM.hit_name) ) @@ -113,81 +136,61 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: # Create a dict with RFAM accessions as keys and value is an array of ensembl transcript stable_ids rfam_transcript_stable_ids = {} for row in result: - rfam_id = None - match = re.search(r"^(RF\d+)", row.hit_name) if match: rfam_id = match.group(1) - - if rfam_id: rfam_transcript_stable_ids.setdefault(rfam_id, []).append(row.stable_id) - # Download file through wget if url present - if wget_url: - uri = urlparse(wget_url) - file = os.path.join(os.path.dirname(file), os.path.basename(uri.path)) - wget.download(wget_url, file) - - # Read data from file - lines = [] - entry = "" - - file_io = gzip.open(file, "r") - for line in file_io: - line = line.decode("latin-1") - if re.search(r"^//", line): - lines.append(entry) - entry = "" - elif ( - re.search(r"^#=GF\sAC", line) - or re.search(r"^#=GF\sID", line) - or re.search(r"^#=GF\sDE", line) - ): - entry += line - file_io.close() + return rfam_transcript_stable_ids - # Add xrefs + def download_file(self, wget_url: str, rfam_file: str) -> str: + uri = urlparse(wget_url) + rfam_file = os.path.join(os.path.dirname(rfam_file), os.path.basename(uri.path)) + wget.download(wget_url, rfam_file) + + return rfam_file + + def process_lines(self, xref_file: str, rfam_db_url: Any, source_id: int, species_id: int, xref_dbi: Connection) -> Tuple[int, int]: xref_count, direct_count = 0, 0 - for entry in lines: - accession, label, description = None, None, None + # Get data from rfam db + rfam_transcript_stable_ids = self.get_rfam_transcript_stable_ids(rfam_db_url) + + for section in self.get_file_sections(xref_file, "//\n", "utf-8"): + entry = "".join(section) # Extract data from entry - match = 
re.search(r"^#=GF\sAC\s+(\w+)", entry, flags=re.MULTILINE) - if match: - accession = match.group(1) - match = re.search(r"^#=GF\sID\s+([^\n]+)", entry, flags=re.MULTILINE) - if match: - label = match.group(1) - match = re.search(r"^#=GF\sDE\s+([^\n]+)", entry, flags=re.MULTILINE) - if match: - description = match.group(1) - - if accession: - if rfam_transcript_stable_ids.get(accession): - xref_id = self.add_xref( - { - "accession": accession, - "version": 0, - "label": label or accession, - "description": description, - "source_id": source_id, - "species_id": species_id, - "info_type": "DIRECT", - }, - xref_dbi, - ) - xref_count += 1 - - transcript_stable_ids = rfam_transcript_stable_ids[accession] - for stable_id in transcript_stable_ids: - self.add_direct_xref( - xref_id, stable_id, "Transcript", "", xref_dbi - ) - direct_count += 1 - - result_message = ( - f"Added {xref_count} RFAM xrefs and {direct_count} direct xrefs" - ) + accession, label, description = self.extract_entry_data(entry) + + if accession and rfam_transcript_stable_ids.get(accession): + print("accession in dict") + xref_id = self.add_xref( + { + "accession": accession, + "version": 0, + "label": label or accession, + "description": description, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + xref_count += 1 - return 0, result_message + for stable_id in rfam_transcript_stable_ids[accession]: + self.add_direct_xref(xref_id, stable_id, "Transcript", "", xref_dbi) + direct_count += 1 + + return xref_count, direct_count + + def extract_entry_data(self, entry: str) -> Tuple[Optional[str], Optional[str], Optional[str]]: + accession = self.extract_pattern(entry, self.ACCESSION_PATTERN) + label = self.extract_pattern(entry, self.LABEL_PATTERN) + description = self.extract_pattern(entry, self.DESCRIPTION_PATTERN) + + return accession, label, description + + def extract_pattern(self, text: str, pattern: re.Pattern) -> Optional[str]: + match = pattern.search(text) + return match.group(1) if match else None diff --git a/src/python/ensembl/production/xrefs/parsers/RGDParser.py b/src/python/ensembl/production/xrefs/parsers/RGDParser.py index 11ddd0e0e..54b574e82 100644 --- a/src/python/ensembl/production/xrefs/parsers/RGDParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RGDParser.py @@ -14,57 +14,65 @@ """Parser module for RGD source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import csv +import re +from typing import Any, Dict, List, Tuple +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class RGDParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") - direct_source_id = self.get_source_id_for_source_name( - "RGD", xref_dbi, "direct_xref" - ) + direct_source_id = self.get_source_id_for_source_name("RGD", xref_dbi, "direct_xref") - # Used to assign dbIDs for when RGD Xrefs are dependent on RefSeq xrefs - preloaded_refseq = self.get_valid_codes("refseq", 
species_id, xref_dbi) + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"RGD file is empty") + file_io.seek(0) - rgd_io = self.get_filehandle(file) - csv_reader = csv.DictReader( - filter(lambda row: row[0] != "#", rgd_io), delimiter="\t" + csv_reader = csv.DictReader(filter(lambda row: row[0] != "#", file_io), delimiter="\t") + + dependent_count, ensembl_count, mismatch_count, syn_count = self.process_lines(csv_reader, source_id, direct_source_id, species_id, xref_dbi) + + result_message = ( + f"{dependent_count} xrefs successfully loaded and dependent on refseq\n" + f"\t{mismatch_count} xrefs added but with NO dependencies\n" + f"\t{ensembl_count} direct xrefs successfully loaded\n" + f"\tAdded {syn_count} synonyms, including duplicates" ) - header_found, count, ensembl_count, mismatch, syn_count = 0, 0, 0, 0, 0 - columns = {} + return 0, result_message + + def process_lines(self, csv_reader: csv.DictReader, source_id: int, direct_source_id: int, species_id: int, xref_dbi: Connection) -> Tuple[int, int, int, int]: + dependent_count, ensembl_count, mismatch_count, syn_count = 0, 0, 0, 0 - # Read lines - for line in csv_reader: - # Don't bother doing anything if we don't have an RGD ID - if not line.get("GENE_RGD_ID") or not line["GENE_RGD_ID"]: - continue + # Used to assign dbIDs for when RGD Xrefs are dependent on RefSeq xrefs + preloaded_refseq = self.get_valid_codes("refseq", species_id, xref_dbi) - # Some RGD annotation is directly copied from Ensembl - if re.search("ENSRNO", line["SYMBOL"]): + for line in csv_reader: + # Don't bother doing anything if we don't have an RGD ID or if the symbol is an Ensembl ID + if not line["GENE_RGD_ID"] or re.search("ENSRNO", line["SYMBOL"]): continue - genbank_nucleotides = [] - if line.get("GENBANK_NUCLEOTIDE"): - genbank_nucleotides = line["GENBANK_NUCLEOTIDE"].split(";") + genbank_nucleotides = line["GENBANK_NUCLEOTIDE"].split(";") + done = False - done = 0 # The nucleotides are sorted in the file in alphabetical order. 
Filter them down # to a higher quality subset, then add dependent Xrefs where possible for nucleotide in self.sort_refseq_accessions(genbank_nucleotides): - if not done and preloaded_refseq.get(nucleotide): - for xref in preloaded_refseq[nucleotide]: + if not done and nucleotide in preloaded_refseq: + for master_xref_id in preloaded_refseq[nucleotide]: xref_id = self.add_dependent_xref( { - "master_xref_id": xref, + "master_xref_id": master_xref_id, "accession": line["GENE_RGD_ID"], "label": line["SYMBOL"], "description": line["NAME"], @@ -73,43 +81,35 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: }, xref_dbi, ) + dependent_count += 1 - count += 1 - syn_count += self.process_synonyms( - xref_id, line["OLD_SYMBOL"], xref_dbi - ) - done = 1 + syn_count += self.process_synonyms(xref_id, line["OLD_SYMBOL"], xref_dbi) + done = True # Add direct xrefs - if line.get("ENSEMBL_ID"): + if line["ENSEMBL_ID"]: ensembl_ids = line["ENSEMBL_ID"].split(";") - - for id in ensembl_ids: - self.add_to_direct_xrefs( + for ensembl_id in ensembl_ids: + xref_id = self.add_xref( { - "stable_id": id, - "ensembl_type": "gene", "accession": line["GENE_RGD_ID"], "label": line["SYMBOL"], "description": line["NAME"], "source_id": direct_source_id, "species_id": species_id, + "info_type": "DIRECT", }, xref_dbi, ) - xref_id = self.get_xref_id( - line["GENE_RGD_ID"], direct_source_id, species_id, xref_dbi - ) - + self.add_direct_xref(xref_id, ensembl_id, "gene", "", xref_dbi) ensembl_count += 1 - syn_count += self.process_synonyms( - xref_id, line["OLD_SYMBOL"], xref_dbi - ) - done = 1 + + syn_count += self.process_synonyms(xref_id, line["OLD_SYMBOL"], xref_dbi) + done = True # If neither direct or dependent, add misc xref if not done: - xref_id = self.add_xref( + self.add_xref( { "accession": line["GENE_RGD_ID"], "label": line["SYMBOL"], @@ -120,35 +120,23 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: }, xref_dbi, ) - mismatch += 1 - - rgd_io.close() - - result_message = f"{count} xrefs succesfully loaded and dependent on refseq\n" - result_message += f"\t{mismatch} xrefs added but with NO dependencies\n" - result_message += f"\t{ensembl_count} direct xrefs successfully loaded\n" - result_message += f"\tAdded {syn_count} synonyms, including duplicates" - - return 0, result_message + mismatch_count += 1 + + return dependent_count, ensembl_count, mismatch_count, syn_count def sort_refseq_accessions(self, accessions: List[str]) -> List[str]: refseq_priorities = {"NM": 1, "NP": 1, "NR": 1, "XM": 2, "XP": 2, "XR": 2} - - accessions = sorted( - [x for x in accessions if x[:2] in refseq_priorities], - key=lambda x: (refseq_priorities[x[:2]], x), + return sorted( + (acc for acc in accessions if acc[:2] in refseq_priorities), + key=lambda acc: (refseq_priorities[acc[:2]], acc), ) - return accessions def process_synonyms(self, xref_id: int, synonym_string: str, dbi: Connection) -> int: - syn_count = 0 - if not synonym_string or not xref_id: - return syn_count + return 0 synonyms = synonym_string.split(";") for synonym in synonyms: self.add_synonym(xref_id, synonym, dbi) - syn_count += 1 - return syn_count + return len(synonyms) diff --git a/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py b/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py index 4ae9b46d8..c00df2cc4 100644 --- a/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py +++ b/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py @@ -14,38 +14,31 @@ """Parser module for Reactome source.""" -from 
ensembl.production.xrefs.parsers.BaseParser import * +import logging +import re +from typing import Any, Dict, Optional, Tuple +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class ReactomeParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - species_name = args["species_name"] - file = args["file"] - release_file = args["rel_file"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) + source_id = args.get("source_id") + species_id = args.get("species_id") + species_name = args.get("species_name") + xref_file = args.get("file") + release_file = args.get("rel_file") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") # Parse release file if release_file: - release = None - - release_io = self.get_filehandle(release_file) - for line in release_io: - match = re.search(r"([0-9]*)", line) - if match: - release = match.group(1) - if verbose: - logging.info(f"Reactome release is '{release}'") - release_io.close() - + release = self.parse_release_file(release_file, verbose) if not release: - raise IOError(f"Could not find release using {release_file}") - + raise ValueError(f"Could not find release using {release_file}") self.set_release(source_id, release, xref_dbi) # Create a hash of all valid names for this species @@ -60,130 +53,122 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: alias_to_species_id = {alias: 1 for alias in aliases} # Get relevant source ids - reactome_source_id = self.get_source_id_for_source_name( - "reactome", xref_dbi, "direct" - ) - transcript_reactome_source_id = self.get_source_id_for_source_name( - "reactome_transcript", xref_dbi - ) - gene_reactome_source_id = self.get_source_id_for_source_name( - "reactome_gene", xref_dbi - ) - reactome_uniprot_source_id = self.get_source_id_for_source_name( - "reactome", xref_dbi, "uniprot" + source_ids = self.get_source_ids(xref_dbi, verbose) + + parsed_count, dependent_count, direct_count, error_count = self.process_file(xref_file, alias_to_species_id, source_ids, species_id, xref_dbi, verbose) + + result_message = ( + f"{parsed_count} Reactome entries processed\n" + f"\t{dependent_count} dependent xrefs added\n" + f"\t{direct_count} direct xrefs added\n" + f"\t{error_count} not found" ) + return 0, result_message - # Cannot continue unless source ids are found - if ( - not reactome_source_id - or not transcript_reactome_source_id - or not gene_reactome_source_id - ): - raise KeyError("Could not find source id for reactome sources") - else: - if verbose: - logging.info(f"Source_id = {reactome_source_id}") - logging.info(f"Transcript_source_id = {transcript_reactome_source_id}") - logging.info(f"Gene_source_id = {gene_reactome_source_id}") - - if not reactome_uniprot_source_id: - raise KeyError("Could not find source id for reactome uniprot") - else: - if verbose: - logging.info(f"Uniprot_source_id = {reactome_uniprot_source_id}") - - # Get uniprot accessions - is_uniprot = 0 - uniprot_accessions = {} - if re.search("UniProt", file): - is_uniprot = 1 - uniprot_accessions = self.get_valid_codes("uniprot/", species_id, xref_dbi) - - parsed_count, err_count = 0, 0 - - # 
Read file - reactome_io = self.get_filehandle(file) - - for line in reactome_io: - line = line.strip() - - (ensembl_stable_id, reactome_id, url, description, evidence, species) = ( - re.split(r"\t+", line) - ) - - # Check description pattern - match = re.search( - r"^[A-Za-z0-9_,\(\)\/\-\.:\+'&;\"\/\?%>\s\[\]]+$", description - ) - if not match: - continue - - species = re.sub(r"\s", "_", species) - species = species.lower() - - current_source_id = reactome_source_id - - if alias_to_species_id.get(species): - parsed_count += 1 - - ensembl_type = None - info_type = "DIRECT" - - # Add uniprot dependent xrefs - if is_uniprot: - if uniprot_accessions.get(ensembl_stable_id): - for xref in uniprot_accessions[ensembl_stable_id]: - xref_id = self.add_dependent_xref( - { - "master_xref_id": xref, - "accession": reactome_id, - "label": reactome_id, - "description": description, - "source_id": reactome_uniprot_source_id, - "species_id": species_id, - }, - xref_dbi, - ) - info_type = "DEPENDENT" - - # Attempt to guess the object_type based on the stable id - elif re.search(r"G[0-9]*$", ensembl_stable_id): - ensembl_type = "gene" - current_source_id = gene_reactome_source_id - elif re.search(r"T[0-9]*$", ensembl_stable_id): - ensembl_type = "transcript" - current_source_id = transcript_reactome_source_id - elif re.search(r"P[0-9]*$", ensembl_stable_id): - ensembl_type = "translation" - - # Is not in Uniprot and does not match Ensembl stable id format - else: + def parse_release_file(self, release_file: str, verbose: bool) -> Optional[str]: + release = None + with self.get_filehandle(release_file) as release_io: + for line in release_io: + match = re.search(r"([0-9]*)", line) + if match: + release = match.group(1) if verbose: - logging.debug(f"Could not find type for {ensembl_stable_id}") - err_count += 1 + logging.info(f"Reactome release is '{release}'") + return release + + def get_source_ids(self, xref_dbi: Connection, verbose: bool) -> Tuple[int, int, int, int]: + reactome_source_id = self.get_source_id_for_source_name("reactome", xref_dbi, "direct") + transcript_reactome_source_id = self.get_source_id_for_source_name("reactome_transcript", xref_dbi) + gene_reactome_source_id = self.get_source_id_for_source_name("reactome_gene", xref_dbi) + reactome_uniprot_source_id = self.get_source_id_for_source_name("reactome", xref_dbi, "uniprot") + + if verbose: + logging.info(f"Source_id = {reactome_source_id}") + logging.info(f"Transcript_source_id = {transcript_reactome_source_id}") + logging.info(f"Gene_source_id = {gene_reactome_source_id}") + logging.info(f"Uniprot_source_id = {reactome_uniprot_source_id}") + + return { + "reactome_source_id": reactome_source_id, + "transcript_reactome_source_id": transcript_reactome_source_id, + "gene_reactome_source_id": gene_reactome_source_id, + "reactome_uniprot_source_id": reactome_uniprot_source_id + } + + def process_file(self, xref_file: str, alias_to_species_id: Dict[str, int], source_ids: Dict[str, int], species_id: int, xref_dbi: Connection, verbose: bool) -> Tuple[int, int, int, int]: + parsed_count, dep_count, direct_count, err_count = 0, 0, 0, 0 + + # Get existing uniprot accessions + is_uniprot = bool(re.search("UniProt", xref_file)) + uniprot_accessions = self.get_valid_codes("uniprot/", species_id, xref_dbi) if is_uniprot else {} + + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"Reactome file is empty") + file_io.seek(0) + + for line in file_io: + line = line.strip() + ensembl_stable_id, reactome_id, url, 
description, evidence, species = re.split(r"\t+", line) + + # Check description pattern + if not re.match(r"^[A-Za-z0-9_,\(\)\/\-\.:\+'&;\"\/\?%>\s\[\]]+$", description): continue - # Add new entry for reactome xref as well as direct xref to ensembl stable id - xref_id = self.add_xref( - { - "accession": reactome_id, - "label": reactome_id, - "description": description, - "source_id": current_source_id, - "species_id": species_id, - "info_type": info_type, - }, - xref_dbi, - ) - - if ensembl_type: - self.add_direct_xref( - xref_id, ensembl_stable_id, ensembl_type, "", xref_dbi - ) - - reactome_io.close() - - result_message = f"{parsed_count} entries processed\n" - result_message += f"{err_count} not found" - - return 0, result_message + species = re.sub(r"\s", "_", species).lower() + + # Continue only for current species + if alias_to_species_id.get(species): + parsed_count += 1 + + ensembl_type = None + info_type = "DIRECT" + current_source_id = source_ids["reactome_source_id"] + + if is_uniprot: + if uniprot_accessions.get(ensembl_stable_id): # Add uniprot dependent xrefs + for xref in uniprot_accessions[ensembl_stable_id]: + self.add_dependent_xref( + { + "master_xref_id": xref, + "accession": reactome_id, + "label": reactome_id, + "description": description, + "source_id": source_ids["reactome_uniprot_source_id"], + "species_id": species_id, + "info_type": "DEPENDENT", + }, + xref_dbi, + ) + dep_count += 1 + elif re.search(r"G[0-9]*$", ensembl_stable_id): # Attempt to guess the object_type based on the stable id + ensembl_type = "gene" + current_source_id = source_ids["gene_reactome_source_id"] + elif re.search(r"T[0-9]*$", ensembl_stable_id): + ensembl_type = "transcript" + current_source_id = source_ids["transcript_reactome_source_id"] + elif re.search(r"P[0-9]*$", ensembl_stable_id): + ensembl_type = "translation" + else: # Is not in Uniprot and does not match Ensembl stable id format + if verbose: + logging.debug(f"Could not find type for {ensembl_stable_id}") + err_count += 1 + continue + + # Add new entry for reactome xref as well as direct xref to ensembl stable id + if ensembl_type: + xref_id = self.add_xref( + { + "accession": reactome_id, + "label": reactome_id, + "description": description, + "source_id": current_source_id, + "species_id": species_id, + "info_type": info_type, + }, + xref_dbi, + ) + self.add_direct_xref(xref_id, ensembl_stable_id, ensembl_type, "", xref_dbi) + direct_count += 1 + + return parsed_count, dep_count, direct_count, err_count diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py index 14f6f76dd..61662dcaf 100644 --- a/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py @@ -14,60 +14,27 @@ """Parser module for RefSeq coordinate xrefs.""" -from ensembl.production.xrefs.parsers.BaseParser import * -from ensembl.common.RangeRegistry import RangeRegistry +import json +import logging +import subprocess +from typing import Any, Dict, Tuple +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class RefSeqCoordinateParser(BaseParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - species_name = args["species_name"] - file = args["file"] - dba = args["dba"] - ensembl_release = args["ensembl_release"] - xref_dbi = args["xref_dbi"] - 
verbose = args.get("verbose", False) - - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + source_id = args.get("source_id") + species_id = args.get("species_id") + species_name = args.get("species_name") + dba = args.get("dba") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) - source_ids = { - "peptide": self.get_source_id_for_source_name( - "RefSeq_peptide", xref_dbi, "otherfeatures" - ), - "mrna": self.get_source_id_for_source_name( - "RefSeq_mRNA", xref_dbi, "otherfeatures" - ), - "ncrna": self.get_source_id_for_source_name( - "RefSeq_ncRNA", xref_dbi, "otherfeatures" - ), - "peptide_predicted": self.get_source_id_for_source_name( - "RefSeq_peptide_predicted", xref_dbi, "otherfeatures" - ), - "mrna_predicted": self.get_source_id_for_source_name( - "RefSeq_mRNA_predicted", xref_dbi, "otherfeatures" - ), - "ncrna_predicted": self.get_source_id_for_source_name( - "RefSeq_ncRNA_predicted", xref_dbi, "otherfeatures" - ), - "entrezgene": self.get_source_id_for_source_name("EntrezGene", xref_dbi), - "wikigene": self.get_source_id_for_source_name("WikiGene", xref_dbi), - } + if not source_id or not species_id: + raise AttributeError("Missing required arguments: source_id and species_id") - if verbose: - logging.info(f'RefSeq_peptide source ID = {source_ids["peptide"]}') - logging.info(f'RefSeq_mRNA source ID = {source_ids["mrna"]}') - logging.info(f'RefSeq_ncRNA source ID = {source_ids["ncrna"]}') - logging.info( - f'RefSeq_peptide_predicted source ID = {source_ids["peptide_predicted"]}' - ) - logging.info( - f'RefSeq_mRNA_predicted source ID = {source_ids["mrna_predicted"]}' - ) - logging.info( - f'RefSeq_ncRNA_predicted source ID = {source_ids["ncrna_predicted"]}' - ) + source_ids = self.get_source_ids(verbose, xref_dbi) # Get the species name(s) species_id_to_names = self.species_id_to_names(xref_dbi) @@ -80,17 +47,57 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: # Connect to the appropriate dbs if dba: - scripts_dir = args["perl_scripts_dir"] - xref_db_url = args["xref_db_url"] - source_ids_json = json.dumps(source_ids) - - logging.info( - f"Running perl script {scripts_dir}/refseq_coordinate_parser.pl" - ) - perl_cmd = f"perl {scripts_dir}/refseq_coordinate_parser.pl --xref_db_url '{xref_db_url}' --core_db_url '{args['core_db_url']}' --otherf_db_url '{dba}' --source_ids '{source_ids_json}' --species_id {species_id} --species_name {species_name} --release {ensembl_release}" - cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE) - - return 0, "Added refseq_import xrefs." + return self.run_perl_script(args, source_ids, species_name) else: # Not all species have an otherfeatures database, skip if not found return 0, f"Skipped. No otherfeatures database for '{species_name}'." 
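# Illustrative sketch only (not part of the patch): roughly how one of these xref
# parsers is expected to be driven, based on the run(args) contract visible above.
# The engine URL, source/species ids and the absent otherfeatures handle below are
# assumptions for the example, not values taken from this pipeline.
from sqlalchemy import create_engine

from ensembl.production.xrefs.parsers.RefSeqCoordinateParser import RefSeqCoordinateParser

engine = create_engine("mysql://user:pass@host/xref_update_db")  # assumed URL
with engine.connect() as xref_dbi:
    parser = RefSeqCoordinateParser()
    # With no otherfeatures database handle ("dba"), run() is expected to skip
    # the species and return the "Skipped" message rather than raising.
    status, message = parser.run({
        "source_id": 101,                # assumed
        "species_id": 9606,              # assumed
        "species_name": "homo_sapiens",
        "dba": None,
        "xref_dbi": xref_dbi,
        "verbose": True,
    })
    print(status, message)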
+ + def get_source_ids(self, verbose: bool, xref_dbi: Connection) -> Dict[str, int]: + source_ids = { + "peptide": self.get_source_id_for_source_name("RefSeq_peptide", xref_dbi, "otherfeatures"), + "mrna": self.get_source_id_for_source_name("RefSeq_mRNA", xref_dbi, "otherfeatures"), + "ncrna": self.get_source_id_for_source_name("RefSeq_ncRNA", xref_dbi, "otherfeatures"), + "peptide_predicted": self.get_source_id_for_source_name("RefSeq_peptide_predicted", xref_dbi, "otherfeatures"), + "mrna_predicted": self.get_source_id_for_source_name("RefSeq_mRNA_predicted", xref_dbi, "otherfeatures"), + "ncrna_predicted": self.get_source_id_for_source_name("RefSeq_ncRNA_predicted", xref_dbi, "otherfeatures"), + "entrezgene": self.get_source_id_for_source_name("EntrezGene", xref_dbi), + "wikigene": self.get_source_id_for_source_name("WikiGene", xref_dbi), + } + + if verbose: + logging.info(f'RefSeq_peptide source ID = {source_ids["peptide"]}') + logging.info(f'RefSeq_mRNA source ID = {source_ids["mrna"]}') + logging.info(f'RefSeq_ncRNA source ID = {source_ids["ncrna"]}') + logging.info(f'RefSeq_peptide_predicted source ID = {source_ids["peptide_predicted"]}') + logging.info(f'RefSeq_mRNA_predicted source ID = {source_ids["mrna_predicted"]}') + logging.info(f'RefSeq_ncRNA_predicted source ID = {source_ids["ncrna_predicted"]}') + + return source_ids + + def run_perl_script(self, args: Dict[str, Any], source_ids: Dict[str, int], species_name: str) -> Tuple[int, str]: + # For now, we run a perl script to add the xrefs, which has some mandatory arguments + scripts_dir = args.get("perl_scripts_dir") + xref_db_url = args.get("xref_db_url") + if not scripts_dir or not xref_db_url: + raise AttributeError("Missing required arguments: perl_scripts_dir and xref_db_url") + + source_ids_json = json.dumps(source_ids) + + logging.info(f"Running perl script {scripts_dir}/refseq_coordinate_parser.pl") + perl_cmd = ( + f"perl {scripts_dir}/refseq_coordinate_parser.pl " + f"--xref_db_url '{xref_db_url}' " + f"--core_db_url '{args.get('core_db_url')}' " + f"--otherf_db_url '{args.get('dba')}' " + f"--source_ids '{source_ids_json}' " + f"--species_id {args.get('species_id')} " + f"--species_name {species_name} " + f"--release {args.get('ensembl_release')}" + ) + cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if cmd_output.returncode != 0: + logging.error(f"Perl script ({scripts_dir}/refseq_coordinate_parser.pl) failed with error: {cmd_output.stderr.decode('utf-8')}") + return 1, "Failed to add refseq_import xrefs." + + return 0, "Added refseq_import xrefs." diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py b/src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py deleted file mode 100644 index 93d773270..000000000 --- a/src/python/ensembl/production/xrefs/parsers/RefSeqGPFFParser.py +++ /dev/null @@ -1,341 +0,0 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Parser module for RefSeq sources (dna and peptide).""" - -from ensembl.production.xrefs.parsers.BaseParser import * - - -class RefSeqGPFFParser(BaseParser): - def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - species_name = args["species_name"] - file = args["file"] - release_file = args["rel_file"] - xref_dbi = args["xref_dbi"] - verbose = args.get("verbose", False) - - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") - - # Get needed source ids - source_ids = { - "peptide_source_id": self.get_source_id_for_source_name( - "RefSeq_peptide", xref_dbi - ), - "mrna_source_id": self.get_source_id_for_source_name( - "RefSeq_mRNA", xref_dbi, "refseq" - ), - "ncrna_source_id": self.get_source_id_for_source_name( - "RefSeq_ncRNA", xref_dbi - ), - "pred_peptide_source_id": self.get_source_id_for_source_name( - "RefSeq_peptide_predicted", xref_dbi - ), - "pred_mrna_source_id": self.get_source_id_for_source_name( - "RefSeq_mRNA_predicted", xref_dbi, "refseq" - ), - "pred_ncrna_source_id": self.get_source_id_for_source_name( - "RefSeq_ncRNA_predicted", xref_dbi - ), - "entrez_source_id": self.get_source_id_for_source_name( - "EntrezGene", xref_dbi - ), - "wiki_source_id": self.get_source_id_for_source_name("WikiGene", xref_dbi), - } - - if verbose: - logging.info( - f'RefSeq_peptide source ID = {source_ids["peptide_source_id"]}' - ) - logging.info(f'RefSeq_mRNA source ID = {source_ids["mrna_source_id"]}') - logging.info(f'RefSeq_ncRNA source ID = {source_ids["ncrna_source_id"]}') - logging.info( - f'RefSeq_peptide_predicted source ID = {source_ids["pred_peptide_source_id"]}' - ) - logging.info( - f'RefSeq_mRNA_predicted source ID = {source_ids["pred_mrna_source_id"]}' - ) - logging.info( - f'RefSeq_ncRNA_predicted source ID = {source_ids["pred_ncrna_source_id"]}' - ) - logging.info(f'EntrezGene source ID = {source_ids["entrez_source_id"]}') - logging.info(f'WikiGene source ID = {source_ids["wiki_source_id"]}') - - # Extract version from release file - if release_file: - # Parse and set release info - index = 0 - for section in self.get_file_sections(release_file, "***"): - index += 1 - if index == 2: - release = "".join(section) - release = re.sub(r"\s{2,}", " ", release) - release = release.strip() - release = re.sub( - r".*(NCBI Reference Sequence.*) Distribution.*", r"\1", release - ) - release = re.sub(r"Release (\d+)", r"Release \1,", release) - break - - # Set releases - self.set_release(source_ids["peptide_source_id"], release, xref_dbi) - self.set_release(source_ids["mrna_source_id"], release, xref_dbi) - self.set_release(source_ids["ncrna_source_id"], release, xref_dbi) - self.set_release(source_ids["pred_mrna_source_id"], release, xref_dbi) - self.set_release(source_ids["pred_ncrna_source_id"], release, xref_dbi) - self.set_release(source_ids["pred_peptide_source_id"], release, xref_dbi) - - result_message = self.create_xrefs( - source_ids, species_id, species_name, file, xref_dbi - ) - - return 0, result_message - - def create_xrefs(self, source_ids: Dict[str, int], species_id: int, species_name: str, file: str, dbi: Connection) -> str: - counts = { - "num_mrna": 0, - "num_ncrna": 0, - "num_pred_mrna": 0, - "num_pred_ncrna": 0, - "num_peptide": 0, - "num_pred_peptide": 0, - "num_entrez": 0, - "num_wiki": 0, - } - - # Create a dict 
of all valid names for this species - species_id_to_names = self.species_id_to_names(dbi) - if species_name: - species_id_to_names.setdefault(species_id, []).append(species_name) - if not species_id_to_names.get(species_id): - return "Skipped. Could not find species ID to name mapping" - names = species_id_to_names[species_id] - name_to_species_id = {name: species_id for name in names} - - # Create a dict of all valid taxon_ids for this species - species_id_to_tax = self.species_id_to_taxonomy(dbi) - species_id_to_tax.setdefault(species_id, []).append(species_id) - tax_ids = species_id_to_tax[species_id] - tax_to_species_id = {tax_id: species_id for tax_id in tax_ids} - - # Retrieve existing RefSeq mRNA, EntrezGene, and WikiGene xrefs - entrez_acc_to_label = self.get_acc_to_label("EntrezGene", species_id, dbi) - refseq_ids = self.get_valid_codes("RefSeq_mRNA", species_id, dbi) - refseq_ids.update( - self.get_valid_codes("RefSeq_mRNA_predicted", species_id, dbi) - ) - entrez_ids = self.get_valid_codes("EntrezGene", species_id, dbi) - wiki_ids = self.get_valid_codes("WikiGene", species_id, dbi) - - # Get file type - file_type = self.type_from_file(os.path.basename(file)) - if not file_type: - return f"Could not work out sequence type for {file}" - - xrefs = [] - - # Read file - for section in self.get_file_sections(file, "//\n"): - if len(section) == 1: - continue - - entry = "".join(section) - xref = {} - - # Extract the species name - species_id_check = None - match = re.search(r"\s+ORGANISM\s+(.*)\n", entry) - if match: - species = match.group(1).lower() - species = re.sub(r"^\s*", "", species) - species = re.sub(r"\s*\(.+\)", "", species) - species = re.sub(r"\s+", "_", species) - species = re.sub(r"\n", "", species) - - species_id_check = name_to_species_id[species] - - # Try going through the taxon ID if species check didn't work - if not species_id_check: - match = re.search(r"db_xref=\"taxon:(\d+)\"", entry) - if match: - taxon_id = match.group(1) - species_id_check = tax_to_species_id[taxon_id] - - # Skip xrefs for species that aren't in the species table - if not species_id_check or species_id != species_id_check: - continue - - # Extract accession and version - accession = re.search( - r"^ACCESSION\s+(\S+)", entry, flags=re.MULTILINE - ).group(1) - version = re.search(r"^VERSION\s+(\S+)", entry, flags=re.MULTILINE).group(1) - - # Get the right source ID based on file type and whether this is predicted (X*) or not - source_id = 0 - if file_type == "dna": - if re.search(r"^XM_", accession): - source_id = source_ids["pred_mrna_source_id"] - counts["num_pred_mrna"] += 1 - elif re.search(r"^XR", accession): - source_id = source_ids["pred_ncrna_source_id"] - counts["num_pred_ncrna"] += 1 - elif re.search(r"^NM", accession): - source_id = source_ids["mrna_source_id"] - counts["num_mrna"] += 1 - elif re.search(r"^NR", accession): - source_id = source_ids["ncrna_source_id"] - counts["num_ncrna"] += 1 - elif file_type == "peptide": - if re.search(r"^XP_", accession): - source_id = source_ids["pred_peptide_source_id"] - counts["num_pred_peptide"] += 1 - else: - source_id = source_ids["peptide_source_id"] - counts["num_peptide"] += 1 - - if not source_id: - logging.warning( - f"Could not get source ID for file type {file_type} for accession {accession}" - ) - - (acc_no_version, version) = version.split(".") - xref["ACCESSION"] = accession - if accession == acc_no_version: - xref["VERSION"] = version - - # Extract description (may be multi-line) - description = re.search( - 
r"^DEFINITION\s+([^[]+)", entry, flags=re.MULTILINE - ).group(1) - description = re.sub(r"\nACCESSION.*", "", description, flags=re.DOTALL) - description = re.sub(r"\n", "", description) - description = re.sub(r"{.*}-like", "", description) - description = re.sub(r"{.*}", "", description) - description = re.sub(r"\s+", " ", description) - if len(description) > 255: - description = description[0:255] - - # Extract sequence - sequence = re.search( - r"^\s*ORIGIN\s+(.+)", entry, flags=re.DOTALL | re.MULTILINE - ).group(1) - sequence_lines = sequence.split("\n") - parsed_sequence = "" - for seq_line in sequence_lines: - if seq_line: - sequence_only = re.search(r"^\s*\d+\s+(.*)$", seq_line).group(1) - if not sequence_only: - continue - parsed_sequence += sequence_only - parsed_sequence = re.sub(r"\s", "", parsed_sequence) - - # Extract related pair to current RefSeq accession - # For rna file, the pair is the protein_id - # For peptide file, the pair is in DBSOURCE REFSEQ accession - refseq_pair = None - match = re.search(r"DBSOURCE\s+REFSEQ: accession (\S+)", entry) - if match: - refseq_pair = match.group(1) - protein_id = re.findall(r"\/protein_id=.(\S+_\d+)", entry) - coded_by = re.findall(r"\/coded_by=.(\w+_\d+)", entry) - - for cb in coded_by: - xref["PAIR"] = cb - - if not xref.get("PAIR"): - xref["PAIR"] = refseq_pair - - if not xref.get("PAIR"): - for pi in protein_id: - xref["PAIR"] = pi - - xref["LABEL"] = f"{accession}.{version}" - xref["DESCRIPTION"] = description - xref["SOURCE_ID"] = source_id - xref["SEQUENCE"] = parsed_sequence - xref["SEQUENCE_TYPE"] = file_type - xref["SPECIES_ID"] = species_id - xref["INFO_TYPE"] = "SEQUENCE_MATCH" - xref["DEPENDENT_XREFS"] = [] - - # Extrat NCBIGene ids - seen_in_record = {} - ncbi_gene_ids = re.findall(r"db_xref=.GeneID:(\d+)", entry) - for gene_id in ncbi_gene_ids: - if not seen_in_record.get(gene_id) and entrez_acc_to_label.get(gene_id): - seen_in_record[gene_id] = 1 - - dependent = {} - dependent["SOURCE_ID"] = source_ids["entrez_source_id"] - dependent["LINKAGE_SOURCE_ID"] = source_id - dependent["ACCESSION"] = gene_id - dependent["LABEL"] = entrez_acc_to_label[gene_id] - xref["DEPENDENT_XREFS"].append(dependent) - counts["num_entrez"] += 1 - - dependent = {} - dependent["SOURCE_ID"] = source_ids["wiki_source_id"] - dependent["LINKAGE_SOURCE_ID"] = source_id - dependent["ACCESSION"] = gene_id - dependent["LABEL"] = entrez_acc_to_label[gene_id] - xref["DEPENDENT_XREFS"].append(dependent) - counts["num_wiki"] += 1 - - # Add xrefs for RefSeq mRNA as well where available - if refseq_pair: - refseq_pair = re.sub(r"\.[0-9]*", "", refseq_pair) - if refseq_pair: - if refseq_ids.get(refseq_pair): - for refseq_id in refseq_ids[refseq_pair]: - for entrez_id in entrez_ids.get(gene_id): - self.add_dependent_xref_maponly( - entrez_id, - source_ids["entrez_source_id"], - refseq_id, - None, - dbi, - ) - for wiki_id in wiki_ids.get(gene_id): - self.add_dependent_xref_maponly( - wiki_id, - source_ids["entrez_source_id"], - refseq_id, - None, - dbi, - ) - - xrefs.append(xref) - - if len(xrefs) > 0: - self.upload_xref_object_graphs(xrefs, dbi) - - result_message = f'Added {counts["num_mrna"]} mRNA xrefs, {counts["num_pred_mrna"]} predicted mRNA xrefs, {counts["num_ncrna"]} ncRNA xrefs, {counts["num_pred_ncrna"]} predicted ncRNA xrefs, {counts["num_peptide"]} peptide xrefs, and {counts["num_pred_peptide"]} predicted peptide xrefs\n' - result_message += f"Added the following dependent xrefs:\n" - result_message += f'\tEntrezGene\t{counts["num_entrez"]}\n' - 
result_message += f'\tWikiGene\t{counts["num_wiki"]}\n' - - return result_message - - def type_from_file(self, file_name: str) -> Optional[str]: - if re.search("RefSeq_protein", file_name): - return "peptide" - if re.search("rna", file_name): - return "dna" - if re.search("protein", file_name): - return "peptide" - - return None diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqParser.py b/src/python/ensembl/production/xrefs/parsers/RefSeqParser.py new file mode 100644 index 000000000..f9e62c218 --- /dev/null +++ b/src/python/ensembl/production/xrefs/parsers/RefSeqParser.py @@ -0,0 +1,316 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Parser module for RefSeq sources (dna and peptide).""" + +import os +import re +import logging +from typing import Any, Dict, Optional, Tuple +from sqlalchemy.engine import Connection + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser + +class RefSeqParser(BaseParser): + ORGAMISM_PATTERN = re.compile(r"\s+ORGANISM\s+(.*)\n") + TAXON_PATTERN = re.compile(r"db_xref=\"taxon:(\d+)\"") + ACCESSION_PATTERN = re.compile(r"^ACCESSION\s+(\S+)", re.MULTILINE) + VERSION_PATTERN = re.compile(r"^VERSION\s+(\S+)", re.MULTILINE) + TYPE_PATTERNS = { + "dna": { + re.compile(r"^XM_"): ("num_pred_mrna", "pred_mrna_source_id"), + re.compile(r"^XR"): ("num_pred_ncrna", "pred_ncrna_source_id"), + re.compile(r"^NM"): ("num_mrna", "mrna_source_id"), + re.compile(r"^NR"): ("num_ncrna", "ncrna_source_id"), + }, + "peptide": { + re.compile(r"^XP_"): ("num_pred_peptide", "pred_peptide_source_id"), + } + } + DESCRIPTION_PATTERN = re.compile(r"^DEFINITION\s+(.+?)(?=\n\S)", re.DOTALL | re.MULTILINE) + DESC_REMOVE_BRACES_PATTERN = re.compile(r"\{.*?\}-like|\{.*?\}") + NORMALIZE_WHITESPACE_PATTERN = re.compile(r"\s+") + SEQUENCE_PATTERN = re.compile(r"^\s*ORIGIN\s+(.+)", re.DOTALL | re.MULTILINE) + SEQ_REMOVE_NUMBERS_PATTERN = re.compile(r"\d+\s+") + PROTEIN_IDS_PATTERN = re.compile(r"\/protein_id=.(\S+_\d+)") + CODED_BY_PATTERN = re.compile(r"\/coded_by=.(\w+_\d+)") + DBSOURCE_PATTERN = re.compile(r"^DBSOURCE\s+REFSEQ: accession (\S+_\d+)") + GENEID_PATTERN = re.compile(r"db_xref=.GeneID:(\d+)") + + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: + source_id = args.get("source_id") + species_id = args.get("species_id") + species_name = args.get("species_name") + xref_file = args.get("file") + release_file = args.get("rel_file") + xref_dbi = args.get("xref_dbi") + verbose = args.get("verbose", False) + + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") + + # Get needed source ids + source_ids = { + "peptide_source_id": self.get_source_id_for_source_name("RefSeq_peptide", xref_dbi), + "mrna_source_id": self.get_source_id_for_source_name("RefSeq_mRNA", xref_dbi, "refseq"), + "ncrna_source_id": self.get_source_id_for_source_name("RefSeq_ncRNA", xref_dbi), 
+ "pred_peptide_source_id": self.get_source_id_for_source_name("RefSeq_peptide_predicted", xref_dbi), + "pred_mrna_source_id": self.get_source_id_for_source_name("RefSeq_mRNA_predicted", xref_dbi, "refseq"), + "pred_ncrna_source_id": self.get_source_id_for_source_name("RefSeq_ncRNA_predicted", xref_dbi), + "entrez_source_id": self.get_source_id_for_source_name("EntrezGene", xref_dbi), + "wiki_source_id": self.get_source_id_for_source_name("WikiGene", xref_dbi), + } + + if verbose: + for key, value in source_ids.items(): + logging.info(f'{key} = {value}') + + # Extract version from release file + if release_file: + release = self.extract_release_info(release_file) + + if release: + if verbose: + logging.info(f"RefSeq release info: {release}") + + for key in ["peptide_source_id", "mrna_source_id", "ncrna_source_id", "pred_mrna_source_id", "pred_ncrna_source_id", "pred_peptide_source_id"]: + self.set_release(source_ids[key], release, xref_dbi) + + result_message = self.create_xrefs(source_ids, species_id, species_name, xref_file, xref_dbi) + return 0, result_message + + def extract_release_info(self, release_file: str) -> str: + release_info = "" + for section in self.get_file_sections(release_file, "***"): + release_info = "".join(section) + break + + match = re.search(r"(NCBI Reference Sequence.*?)(?=Distribution)", release_info, re.DOTALL) + if match: + release = " ".join(match.group(1).split()) + release = re.sub(r"Release (\d+)", r"Release \1,", release) + return release + + return None + + def create_xrefs(self, source_ids: Dict[str, int], species_id: int, species_name: str, xref_file: str, dbi: Connection) -> str: + counts = { + "num_mrna": 0, + "num_ncrna": 0, + "num_pred_mrna": 0, + "num_pred_ncrna": 0, + "num_peptide": 0, + "num_pred_peptide": 0, + "num_entrez": 0, + "num_wiki": 0, + } + + # Create a dict of all valid names for this species + species_id_to_names = self.species_id_to_names(dbi) + if species_name: + species_id_to_names.setdefault(species_id, []).append(species_name) + if not species_id_to_names.get(species_id): + return "Skipped. Could not find species ID to name mapping" + names = species_id_to_names[species_id] + name_to_species_id = {name: species_id for name in names} + + # Create a dict of all valid taxon_ids for this species + species_id_to_tax = self.species_id_to_taxonomy(dbi) + species_id_to_tax.setdefault(species_id, []).append(species_id) + tax_ids = species_id_to_tax[species_id] + tax_to_species_id = {tax_id: species_id for tax_id in tax_ids} + + # Get file type + file_type = self.type_from_file(os.path.basename(xref_file)) + if not file_type: + return f"Skipped. 
Could not work out sequence type for {xref_file}" + + # Retrieve existing RefSeq mRNA, EntrezGene, and WikiGene xrefs + entrez_acc_to_label = self.get_acc_to_label("EntrezGene", species_id, dbi) + refseq_ids = self.get_valid_codes("RefSeq_mRNA", species_id, dbi) + refseq_ids.update(self.get_valid_codes("RefSeq_mRNA_predicted", species_id, dbi)) + entrez_ids = self.get_valid_codes("EntrezGene", species_id, dbi) + wiki_ids = self.get_valid_codes("WikiGene", species_id, dbi) + + xrefs = [] + + # Read file + for section in self.get_file_sections(xref_file, "//\n"): + entry = "".join(section) + xref = {} + + # Extract the species name and check species ID + species_id_check = self.check_species(entry, name_to_species_id, tax_to_species_id) + + # Skip xrefs for species that don't pass the check + if not species_id_check or species_id != species_id_check: + continue + + # Extract accession + accession = self.ACCESSION_PATTERN.search(entry).group(1) + xref["ACCESSION"] = accession + + # Get the right source ID based on file type and whether this is predicted (X*) or not + source_id = self.get_source_id_for_accession(accession, file_type, source_ids, counts) + # result_message += f"{accession}--{source_id}|" + + if not source_id: + logging.warning(f"Could not get source ID for file type {file_type} for accession {accession}") + continue + + # Extract and fix the version + version = self.VERSION_PATTERN.search(entry).group(1) + acc_no_version, version = version.split(".", 1) if "." in version else (version, None) + if acc_no_version == accession and version is not None: + xref["VERSION"] = version + + # Extract description (may be multi-line) + description = self.extract_description(entry) + + # Extract sequence + parsed_sequence = self.extract_sequence(entry) + + # Extract related pair to current RefSeq accession + # - for rna file, the pair is the protein_id + # - for peptide file, the pair is in DBSOURCE REFSEQ accession or in the coded_by + xref["PAIR"] = self.extract_refseq_pair(file_type, entry) + + # Build the xref fields + xref["LABEL"] = f"{accession}.{version}" + xref["DESCRIPTION"] = description + xref["SOURCE_ID"] = source_id + xref["SEQUENCE"] = parsed_sequence + xref["SEQUENCE_TYPE"] = file_type + xref["SPECIES_ID"] = species_id + xref["INFO_TYPE"] = "SEQUENCE_MATCH" + xref["DEPENDENT_XREFS"] = [] + + # Extract NCBIGene ids + seen_in_record = {} + ncbi_gene_ids = self.GENEID_PATTERN.findall(entry) + for gene_id in ncbi_gene_ids: + if gene_id not in seen_in_record and gene_id in entrez_acc_to_label: + seen_in_record[gene_id] = True + entrez_label = entrez_acc_to_label[gene_id] + + self.add_dependents(xref, gene_id, source_ids, entrez_label, counts) + + if file_type == "peptide" and xref['PAIR']: + if refseq_ids.get(xref['PAIR']): + for refseq_id in refseq_ids[xref['PAIR']]: + for entrez_id in entrez_ids.get(gene_id, []): + self.add_dependent_xref_maponly(entrez_id, source_ids["entrez_source_id"], refseq_id, None, dbi) + for wiki_id in wiki_ids.get(gene_id, []): + self.add_dependent_xref_maponly(wiki_id, source_ids["wiki_source_id"], refseq_id, None, dbi) + + xrefs.append(xref) + + if xrefs: + self.upload_xref_object_graphs(xrefs, dbi) + + result_message = ( + f'Added {counts["num_mrna"]} mRNA xrefs, {counts["num_pred_mrna"]} predicted mRNA xrefs, ' + f'{counts["num_ncrna"]} ncRNA xrefs, {counts["num_pred_ncrna"]} predicted ncRNA xrefs, ' + f'{counts["num_peptide"]} peptide xrefs, and {counts["num_pred_peptide"]} predicted peptide xrefs\n' + f'Added the following dependent xrefs:\n' + 
f'\tEntrezGene\t{counts["num_entrez"]}\n' + f'\tWikiGene\t{counts["num_wiki"]}\n' + ) + + return result_message + + def check_species(self, entry: str, name_to_species_id: Dict[str, int], tax_to_species_id: Dict[int, int]) -> Optional[int]: + species_id_check = None + + match = self.ORGAMISM_PATTERN.search(entry) + if match: + species = match.group(1).lower().strip() + species = re.sub(r"\s*\(.+\)", "", species) + species = re.sub(r"\s+", "_", species) + species_id_check = name_to_species_id.get(species) + + # Try going through the taxon ID if species check didn't work + if not species_id_check: + match = self.TAXON_PATTERN.search(entry) + if match: + taxon_id = int(match.group(1)) + species_id_check = tax_to_species_id.get(taxon_id) + + return species_id_check + + def get_source_id_for_accession(self, accession: str, file_type: str, source_ids: Dict[str, int], counts: Dict[str, int]) -> int: + # Check for dna or peptide patterns + if file_type in self.TYPE_PATTERNS: + for pattern, (count_key, source_id_key) in self.TYPE_PATTERNS[file_type].items(): + if pattern.search(accession): + counts[count_key] += 1 + return source_ids[source_id_key] + + # Default case for peptide + if file_type == "peptide": + counts["num_peptide"] += 1 + return source_ids["peptide_source_id"] + + return 0 + + def extract_description(self, entry: str) -> str: + description = self.DESCRIPTION_PATTERN.search(entry).group(1).strip() + description = self.DESC_REMOVE_BRACES_PATTERN.sub("", description) + description = self.NORMALIZE_WHITESPACE_PATTERN.sub(" ", description) + + return description[:255].strip() + + def extract_sequence(self, entry: str) -> str: + sequence = self.SEQUENCE_PATTERN.search(entry).group(1) + sequence = self.SEQ_REMOVE_NUMBERS_PATTERN.sub("", sequence) + parsed_sequence = "".join(self.NORMALIZE_WHITESPACE_PATTERN.sub("", seq_line) for seq_line in sequence.split("\n") if seq_line) + + return parsed_sequence + + def extract_refseq_pair(self, file_type:str, entry: str) -> Optional[str]: + if file_type == "dna": + protein_ids = self.PROTEIN_IDS_PATTERN.findall(entry) + if protein_ids: + return protein_ids[-1] + elif file_type == "peptide": + coded_by = self.CODED_BY_PATTERN.findall(entry) + if coded_by: + return coded_by[-1] + + match = self.DBSOURCE_PATTERN.search(entry) + if match: + return match.group(1) + + return None + + def add_dependents(self, xref: Dict[str, Any], gene_id: str, source_ids: Dict[str, int], label: str, counts: Dict[str, int]) -> None: + # Add EntrezGene and WikiGene dependent xrefs + for source_key in ["entrez", "wiki"]: + dependent = { + "SOURCE_ID": source_ids[f"{source_key}_source_id"], + "LINKAGE_SOURCE_ID": xref["SOURCE_ID"], + "ACCESSION": gene_id, + "LABEL": label, + } + xref["DEPENDENT_XREFS"].append(dependent) + counts[f"num_{source_key}"] += 1 + + def type_from_file(self, file_name: str) -> Optional[str]: + if re.search("RefSeq_protein", file_name): + return "peptide" + if re.search("rna", file_name): + return "dna" + if re.search("protein", file_name): + return "peptide" + return None diff --git a/src/python/ensembl/production/xrefs/parsers/UCSCParser.py b/src/python/ensembl/production/xrefs/parsers/UCSCParser.py index 5de152912..5759b782c 100644 --- a/src/python/ensembl/production/xrefs/parsers/UCSCParser.py +++ b/src/python/ensembl/production/xrefs/parsers/UCSCParser.py @@ -14,49 +14,73 @@ """Parser module for UCSC source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import csv +import re +from typing import Any, Dict, Tuple +from sqlalchemy.sql 
import insert +from sqlalchemy.engine import Connection +from ensembl.xrefs.xref_update_db_model import CoordinateXref as CoordinateXrefORM + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class UCSCParser(BaseParser): + CHROMOSOME_PATTERN = re.compile(r"\Achr") + EXON_PATTERN = re.compile(r",\Z") + EXON_SPLIT_PATTERN = re.compile(r"\s*,\s*") + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") - count = 0 + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"UCSC file is empty") + file_io.seek(0) + + csv_reader = csv.reader(file_io, delimiter="\t", strict=True) + + count = self.process_lines(csv_reader, source_id, species_id, xref_dbi) + + result_message = f"Loaded a total of {count} UCSC xrefs" + return 0, result_message - file_io = self.get_filehandle(file) - csv_reader = csv.reader(file_io, delimiter="\t", strict=True) + def process_lines(self, csv_reader: csv.reader, source_id: int, species_id: int, xref_dbi: Connection) -> int: + count = 0 # Read lines for line in csv_reader: - chromosome = line[1] - strand = line[2] - tx_start = int(line[3]) - tx_end = int(line[4]) - cds_start = int(line[5]) - cds_end = int(line[6]) - exon_starts = line[8] - exon_ends = line[9] - accession = line[11] + try: + chromosome = line[1].strip() + strand = line[2].strip() + exon_starts = line[8].strip() + exon_ends = line[9].strip() + accession = line[11].strip() + + tx_start = int(line[3]) if line[3].strip() else None + tx_end = int(line[4]) if line[4].strip() else None + cds_start = int(line[5]) if line[5].strip() else None + cds_end = int(line[6]) if line[6].strip() else None + + # Check for required keys + if not accession or not chromosome or not strand or tx_start is None or tx_end is None or not exon_starts or not exon_ends: + raise ValueError("Missing required key for xref") + except (IndexError, ValueError) as e: + raise ValueError(f"Error processing line {line}: {e}") # UCSC uses slightly different chromosome names, at least for # human and mouse, so chop off the 'chr' in the beginning. We do # not yet translate the names of the special chromosomes, e.g. # "chr6_cox_hap1" (UCSC) into "c6_COX" (Ensembl) - chromosome = re.sub(r"\Achr", "", chromosome) + chromosome = self.CHROMOSOME_PATTERN.sub("", chromosome) # They also use '+' and '-' for the strand, instead of -1, 0, or 1 - if strand == "+": - strand = 1 - elif strand == "-": - strand = -1 - else: - strand = 0 + strand = 1 if strand == "+" else -1 if strand == "-" else 0 # ... and non-coding transcripts have cds_start == cds_end. # We would like these to be stored as NULLs @@ -65,8 +89,8 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: cds_end = None # exon_starts and exon_ends usually have trailing commas, remove them - exon_starts = re.sub(r",\Z", "", exon_starts) - exon_ends = re.sub(r",\Z", "", exon_ends) + exon_starts = self.EXON_PATTERN.sub("", exon_starts) + exon_ends = self.EXON_PATTERN.sub("", exon_ends) # ... 
and they use the same kind of "inbetween" coordinates as e.g. # exonerate, so increment all start coordinates by one @@ -80,57 +104,24 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: # element of the resulting array, then join the result into a new # comma-separated list exon_starts = ",".join( - str(int(x) + 1) for x in re.split(r"\s*,\s*", exon_starts) + str(int(x) + 1) for x in self.EXON_SPLIT_PATTERN.split(exon_starts) ) - self.add_xref( - source_id, - species_id, - { - "accession": accession, - "chromosome": chromosome, - "strand": strand, - "txStart": tx_start, - "txEnd": tx_end, - "cdsStart": cds_start, - "cdsEnd": cds_end, - "exonStarts": exon_starts, - "exonEnds": exon_ends, - }, - xref_dbi, + # Add coordinate xref + query = insert(CoordinateXrefORM).values( + source_id=source_id, + species_id=species_id, + accession=accession, + chromosome=chromosome, + strand=strand, + txStart=tx_start, + txEnd=tx_end, + cdsStart=cds_start, + cdsEnd=cds_end, + exonStarts=exon_starts, + exonEnds=exon_ends, ) + xref_dbi.execute(query) count += 1 - file_io.close() - - result_message = f"Loaded a total of {count} UCSC xrefs" - - return 0, result_message - - def add_xref(self, source_id: int, species_id: int, xref: Dict[str, Any], dbi: Connection) -> None: - for required_key in [ - "accession", - "chromosome", - "strand", - "txStart", - "txEnd", - "exonStarts", - "exonEnds", - ]: - if not xref.get(required_key): - raise KeyError(f"Missing required key {required_key} for Xref") - - query = insert(CoordinateXrefORM).values( - source_id=source_id, - species_id=species_id, - accession=xref["accession"], - chromosome=xref["chromosome"], - strand=xref["strand"], - txStart=xref["txStart"], - txEnd=xref["txEnd"], - cdsStart=xref["cdsStart"], - cdsEnd=xref["cdsEnd"], - exonStarts=xref["exonStarts"], - exonEnds=xref["exonEnds"], - ) - dbi.execute(query) + return count \ No newline at end of file diff --git a/src/python/ensembl/production/xrefs/parsers/UniProtParser.py b/src/python/ensembl/production/xrefs/parsers/UniProtParser.py index e99b33cdc..1886c6fc6 100644 --- a/src/python/ensembl/production/xrefs/parsers/UniProtParser.py +++ b/src/python/ensembl/production/xrefs/parsers/UniProtParser.py @@ -14,100 +14,112 @@ """Parser module for Uniprot sources.""" -from ensembl.production.xrefs.parsers.BaseParser import * - +import re +import logging +import csv import codecs +from typing import Dict, Any, Tuple, List +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class UniProtParser(BaseParser): + SWISSPROT_RELEASE_PATTERN = re.compile(r"(UniProtKB/Swiss-Prot Release .*)") + TREMBL_RELEASE_PATTERN = re.compile(r"(UniProtKB/TrEMBL Release .*)") + TAXON_PATTERN = re.compile(r"[a-zA-Z_]+=([0-9 ,]+).*;") + CAUTION_PATTERN = re.compile(r"CAUTION: The sequence shown here is derived from an Ensembl") + SP_TYPE_PATTERN = re.compile(r"(\w+)\s+(\w+)") + PROTEIN_EVIDENCE_PATTERN = re.compile(r"(\d+)") + VERSION_PATTERN = re.compile(r"\d+-\w+-\d+, entry version (\d+)") + REVIEWED_PATTERN = re.compile(r"^Reviewed", re.IGNORECASE) + UNREVIEWED_PATTERN = re.compile(r"Unreviewed", re.IGNORECASE) + DESCRIPTION_PATTERN = re.compile(r"(RecName|SubName): Full=(.*)") + ECO_PATTERN = re.compile(r"\s*\{ECO:.*?\}") + EC_PATTERN = re.compile(r"EC=([^;]+)") + SEQUENCE_PATTERN = re.compile(r"^SEQUENCE") + WHITESPACE_PATTERN = re.compile(r"\s+") + GENE_NAME_PATTERN = re.compile(r"Name=(.*)") + SYNONYMS_PATTERN = re.compile(r"Synonyms=(.*)") + SYNONYMS_COMMA_PATTERN = 
re.compile(r"\s*,\s*") + DEPENDENTS_PATTERN = re.compile(r"^(GO|UniGene|RGD|CCDS|IPI|UCSC|SGD|HGNC|MGI|VGNC|Orphanet|ArrayExpress|GenomeRNAi|EPD|Xenbase|Reactome|MIM|GeneCards)") + STABLE_ID_PATTERN = re.compile(r"\.[0-9]+") + PROTEIN_ID_PATTERN = re.compile(r"([^.]+)\.([^.]+)") + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] - release_file = args["rel_file"] - verbose = args.get("verbose", False) - hgnc_file = args.get("hgnc_file") + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + release_file = args.get("rel_file") + verbose = args.get("verbose", False) + hgnc_file = args.get("hgnc_file") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") # Get needed source ids + source_ids = self.get_source_ids(xref_dbi, verbose) + + # Parse and set release info + self.set_release_info(release_file, source_ids, xref_dbi, verbose) + + result_message = self.create_xrefs(source_ids, species_id, xref_file, xref_dbi, hgnc_file) + return 0, result_message + + def get_source_ids(self, dbi: Connection, verbose: bool) -> Dict[str, int]: + source_names = { + "sp_source_id": ("Uniprot/SWISSPROT", "sequence_mapped"), + "sptr_source_id": ("Uniprot/SPTREMBL", "sequence_mapped"), + "sptr_non_display_source_id": ("Uniprot/SPTREMBL", "protein_evidence_gt_2"), + "sp_direct_source_id": ("Uniprot/SWISSPROT", "direct"), + "sptr_direct_source_id": ("Uniprot/SPTREMBL", "direct"), + "isoform_source_id": ("Uniprot_isoform", None), + } + source_ids = { - "sp_source_id": self.get_source_id_for_source_name( - "Uniprot/SWISSPROT", xref_dbi, "sequence_mapped" - ), - "sptr_source_id": self.get_source_id_for_source_name( - "Uniprot/SPTREMBL", xref_dbi, "sequence_mapped" - ), - "sptr_non_display_source_id": self.get_source_id_for_source_name( - "Uniprot/SPTREMBL", xref_dbi, "protein_evidence_gt_2" - ), - "sp_direct_source_id": self.get_source_id_for_source_name( - "Uniprot/SWISSPROT", xref_dbi, "direct" - ), - "sptr_direct_source_id": self.get_source_id_for_source_name( - "Uniprot/SPTREMBL", xref_dbi, "direct" - ), - "isoform_source_id": self.get_source_id_for_source_name( - "Uniprot_isoform", xref_dbi - ), + key: self.get_source_id_for_source_name(name, dbi, type) + for key, (name, type) in source_names.items() } if verbose: - logging.info(f'SwissProt source ID = {source_ids["sp_source_id"]}') - logging.info(f'SpTREMBL source ID = {source_ids["sptr_source_id"]}') - logging.info( - f'SpTREMBL protein_evidence > 2 source ID = {source_ids["sptr_non_display_source_id"]}' - ) - logging.info( - f'SwissProt direct source ID = {source_ids["sp_direct_source_id"]}' - ) - logging.info( - f'SpTREMBL direct source ID = {source_ids["sptr_direct_source_id"]}' - ) + for key, value in source_ids.items(): + logging.info(f'{key} = {value}') - # Parse and set release info - if release_file: - sp_release = None - sptr_release = None + return source_ids + + def set_release_info(self, release_file: str, source_ids: Dict[str, int], dbi: Connection, verbose: bool) -> None: + if not release_file: + return + + sp_release = None + sptr_release = None - release_io = self.get_filehandle(release_file) + with 
self.get_filehandle(release_file) as release_io: for line in release_io: line = line.strip() if not line: continue - match = re.search(r"(UniProtKB/Swiss-Prot Release .*)", line) + match = self.SWISSPROT_RELEASE_PATTERN.search(line) if match: sp_release = match.group(1) if verbose: logging.info(f"Swiss-Prot release is {sp_release}") else: - match = re.search(r"(UniProtKB/TrEMBL Release .*)", line) + match = self.TREMBL_RELEASE_PATTERN.search(line) if match: sptr_release = match.group(1) if verbose: logging.info(f"SpTrEMBL release is {sptr_release}") - release_io.close() - - # Set releases - self.set_release(source_ids["sp_source_id"], sp_release, xref_dbi) - self.set_release(source_ids["sptr_source_id"], sptr_release, xref_dbi) - self.set_release( - source_ids["sptr_non_display_source_id"], sptr_release, xref_dbi - ) - self.set_release(source_ids["sp_direct_source_id"], sp_release, xref_dbi) - self.set_release( - source_ids["sptr_direct_source_id"], sptr_release, xref_dbi - ) - - result_message = self.create_xrefs(source_ids, species_id, file, xref_dbi, hgnc_file) - - return 0, result_message + # Set releases + self.set_release(source_ids["sp_source_id"], sp_release, dbi) + self.set_release(source_ids["sptr_source_id"], sptr_release, dbi) + self.set_release(source_ids["sptr_non_display_source_id"], sptr_release, dbi) + self.set_release(source_ids["sp_direct_source_id"], sp_release, dbi) + self.set_release(source_ids["sptr_direct_source_id"], sptr_release, dbi) - def create_xrefs(self, source_ids: Dict[str, int], species_id: int, file: str, dbi: Connection, hgnc_file: str = None) -> str: + def create_xrefs(self, source_ids: Dict[str, int], species_id: int, xref_file: str, dbi: Connection, hgnc_file: str = None) -> str: counts = { "num_sp": 0, "num_sptr": 0, @@ -120,7 +132,7 @@ def create_xrefs(self, source_ids: Dict[str, int], species_id: int, file: str, d ensembl_derived_protein_count = 0 count = 0 - # Get sources ids of dependent sources + # Get source ids of dependent sources dependent_sources = self.get_xref_sources(dbi) # Extract descriptions from hgnc @@ -137,71 +149,62 @@ def create_xrefs(self, source_ids: Dict[str, int], species_id: int, file: str, d xrefs = [] # Read file - for section in self.get_file_sections(file, "//\n"): - if len(section) == 1: - continue - - entry = "".join(section) + for section in self.get_file_sections(xref_file, "//\n"): + entry = self.extract_entry_fields(section) xref = {} # Extract the species taxon id - found = 0 - match = re.search(r"OX\s+[a-zA-Z_]+=([0-9 ,]+).*;", entry) + found = False + match = self.TAXON_PATTERN.search(entry["OX"][0]) if match: ox = match.group(1) for taxon_id_from_file in ox.split(", "): - taxon_id_from_file = re.sub(r"\s", "", taxon_id_from_file) - if tax_to_species_id.get(taxon_id_from_file): - found = 1 + taxon_id_from_file = taxon_id_from_file.strip() + if tax_to_species_id.get(int(taxon_id_from_file)): + found = True count += 1 - # If no taxon_id's match, skip to next record + # If no taxon_id match found, skip to next record if not found: continue # Check for CC (caution) lines containing certain text # If sequence is from Ensembl, do not use - ensembl_derived_protein = 0 - if re.search( - r"CAUTION: The sequence shown here is derived from an Ensembl", entry - ): - ensembl_derived_protein = 1 - ensembl_derived_protein_count += 1 + ensembl_derived_protein = False + for comment in entry.get("CC", []): + ensembl_derived_protein = bool(self.CAUTION_PATTERN.search(comment)) + if ensembl_derived_protein: + 
ensembl_derived_protein_count += 1 + break # Extract ^AC lines and build list of accessions - accessions = [] - accessions_only = re.findall(r"\nAC\s+(.+)", entry) - for accessions_line in accessions_only: - for acc in accessions_line.split(";"): - acc = acc.strip() - if acc: - accessions.append(acc) + accessions = [acc.strip() for acc in entry["AC"][0].split(";") if acc.strip()] accession = accessions[0] if accession.lower() == "unreviewed": - logging.warn( - f"WARNING: entries with accession of {accession} not allowed, will be skipped" - ) + logging.warning(f"WARNING: entries with accession of {accession} not allowed, will be skipped") continue + # Starting building xref object xref["ACCESSION"] = accession xref["INFO_TYPE"] = "SEQUENCE_MATCH" - xref["SYNONYMS"] = [] - for i in range(1, len(accessions)): - xref["SYNONYMS"].append(accessions[i]) - - sp_type = re.search(r"ID\s+(\w+)\s+(\w+)", entry).group(2) - protein_evidence_code = re.search(r"PE\s+(\d+)", entry).group(1) - version = re.search(r"DT\s+\d+-\w+-\d+, entry version (\d+)", entry).group( - 1 - ) - - # SwissProt/SPTrEMBL are differentiated by having STANDARD/PRELIMINARY here - if re.search(r"^Reviewed", sp_type, re.IGNORECASE): + xref["SYNONYMS"] = accessions[1:] + + # Extract the type, protein evidence code and version + sp_type = self.SP_TYPE_PATTERN.search(entry["ID"][0]).group(2) + protein_evidence_code = self.PROTEIN_EVIDENCE_PATTERN.search(entry["PE"][0]).group(1) + for dt_line in entry.get("DT", []): + match = self.VERSION_PATTERN.search(dt_line) + if match: + version = match.group(1) + break + + # SwissProt/SPTrEMBL are differentiated by having Reviewed/Unreviewed here + if self.REVIEWED_PATTERN.search(sp_type): xref["SOURCE_ID"] = source_ids["sp_source_id"] counts["num_sp"] += 1 - elif re.search(r"Unreviewed", sp_type, re.IGNORECASE): - # Use normal source only if it is PE levels 1 & 2 + elif self.UNREVIEWED_PATTERN.search(sp_type): + # Use normal source only if PE levels 1 & 2 if protein_evidence_code and int(protein_evidence_code) < 3: xref["SOURCE_ID"] = source_ids["sptr_source_id"] counts["num_sptr"] += 1 @@ -220,194 +223,123 @@ def create_xrefs(self, source_ids: Dict[str, int], species_id: int, file: str, d xref["DEPENDENT_XREFS"] = [] xref["DIRECT_XREFS"] = [] - # Extract ^DE lines only and build cumulative description string - description = "" - description_lines = re.findall(r"\nDE\s+(.+)", entry) - for line in description_lines: - match = re.search(r"RecName: Full=(.*);", line) - if match: - if description: - description += "; " - description += match.group(1) - else: - match = re.search(r"SubName: Full=(.*);", line) - if match: - if description: - description += "; " - description += match.group(1) - - description = re.sub(r"^\s*", "", description) - description = re.sub(r"\s*$", "", description) - description = re.sub(r"\s*\{ECO:.*?\}", "", description) - - # Parse the EC_NUMBER line, only for S.cerevisiae for now - if re.search(r"EC=", line) and species_id == "4932": - # Get the EC Number and make it an xref for S.cer if any - EC = re.search(r"\s*EC=([^;]+);", line).group(1) - - dependent = {} - dependent["LABEL"] = EC - dependent["ACCESSION"] = EC - dependent["SOURCE_NAME"] = "EC_NUMBER" - dependent["SOURCE_ID"] = dependent_sources["EC_NUMBER"] - dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] - xref["DEPENDENT_XREFS"].append(dependent) - dependent_xrefs_counts["EC_NUMBER"] = ( - dependent_xrefs_counts.get("EC_NUMBER", 0) + 1 - ) - + # Extract the description + description, ec_number = 
self.extract_description("".join(entry.get("DE", []))) xref["DESCRIPTION"] = description + # Parse the EC_NUMBER, only for S.cerevisiae for now + if ec_number and species_id == 4932: + dependent = {} + dependent["LABEL"] = ec_number + dependent["ACCESSION"] = ec_number + dependent["SOURCE_NAME"] = "EC_NUMBER" + dependent["SOURCE_ID"] = dependent_sources["EC_NUMBER"] + dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] + xref["DEPENDENT_XREFS"].append(dependent) + dependent_xrefs_counts["EC_NUMBER"] = (dependent_xrefs_counts.get("EC_NUMBER", 0) + 1) + # Extract sequence - sequence = re.search(r"SQ\s+(.+)", entry, flags=re.DOTALL).group(1) - sequence = re.sub(r"\n", "", sequence) - sequence = re.sub(r"\/\/", "", sequence) - sequence = re.sub(r"\s", "", sequence) - sequence = re.sub(r"^.*;", "", sequence) + sequence = "" + for seq_line in entry.get("SQ", []): + if not self.SEQUENCE_PATTERN.search(seq_line): + sequence += seq_line + sequence = self.WHITESPACE_PATTERN.sub("", sequence) xref["SEQUENCE"] = sequence # Extract gene names - gene_names = re.findall(r"\nGN\s+(.+)", entry) - gene_names = " ".join(gene_names).split(";") - - # Do not allow the addition of UniProt Gene Name dependent Xrefs - # if the protein was imported from Ensembl. Otherwise we will - # re-import previously set symbols - if not ensembl_derived_protein: - dependent = {} - name_found = 0 - gene_name = None - dep_synonyms = [] - for line in gene_names: - line = line.strip() - - if not re.search(r"Name=", line) and not re.search( - r"Synonyms=", line - ): - continue - - match = re.search(r"Name=([A-Za-z0-9_\-\.\s]+)", line) - if match and not name_found: - gene_name = match.group(1).rstrip() - gene_name = re.sub(r"\nGN", "", gene_name) - name_found = 1 - - match = re.search(r"Synonyms=(.*)", line) - if match: - synonym = match.group(1) - synonym = re.sub(r"\{.*?\}", "", synonym) - synonym = re.sub(r"\s+$", "", synonym) - synonym = re.sub(r"\s*,\s*", ",", synonym) - synonyms = synonym.split(",") - for synonym in synonyms: - if synonym not in dep_synonyms: - dep_synonyms.append(synonym) + if not ensembl_derived_protein and entry.get("GN"): + gene_name, gene_synonyms = self.extract_gene_name(" ".join(entry["GN"])) + # Add dependent xref for gene name if gene_name: + dependent = {} dependent["LABEL"] = gene_name dependent["ACCESSION"] = xref["ACCESSION"] dependent["SOURCE_NAME"] = "Uniprot_gn" dependent["SOURCE_ID"] = dependent_sources["Uniprot_gn"] dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] - dependent["SYNONYMS"] = dep_synonyms - if hgnc_file and hgnc_descriptions.get(gene_name) is not None: + dependent["SYNONYMS"] = gene_synonyms + if hgnc_file and hgnc_descriptions.get(gene_name): dependent["DESCRIPTION"] = hgnc_descriptions[gene_name] xref["DEPENDENT_XREFS"].append(dependent) - dependent_xrefs_counts["Uniprot_gn"] = ( - dependent_xrefs_counts.get("Uniprot_gn", 0) + 1 - ) + dependent_xrefs_counts["Uniprot_gn"] = dependent_xrefs_counts.get("Uniprot_gn", 0) + 1 # Dependent xrefs - only store those that are from sources listed in the source table - deps = re.findall(r"\n(DR\s+.+)", entry) - seen = {} - for dep in deps: - match = re.search(r"^DR\s+(.+)", dep) - if match: - vals = re.split(r";\s*", match.group(1)) - source = vals[0] - acc = vals[1] - extra = [] - if len(vals) > 2: - extra = vals[2 : len(vals)] - - # Skip external sources obtained through other files - if re.search( - r"^(GO|UniGene|RGD|CCDS|IPI|UCSC|SGD|HGNC|MGI|VGNC|Orphanet|ArrayExpress|GenomeRNAi|EPD|Xenbase|Reactome|MIM|GeneCards)", - source, - ): - 
continue
-
-                # If mapped to Ensembl, add as direct xref
-                if source == "Ensembl":
-                    direct = {}
-                    isoform = {}
-
-                    stable_id = extra[0]
-                    stable_id = re.sub(r"\.[0-9]+", "", stable_id)
-                    direct["STABLE_ID"] = stable_id
-                    direct["ENSEMBL_TYPE"] = "Translation"
-                    direct["LINKAGE_TYPE"] = "DIRECT"
-                    if xref["SOURCE_ID"] == source_ids["sp_source_id"]:
-                        direct["SOURCE_ID"] = source_ids["sp_direct_source_id"]
-                        counts["num_direct_sp"] += 1
-                    else:
-                        direct["SOURCE_ID"] = source_ids["sptr_direct_source_id"]
-                        counts["num_direct_sptr"] += 1
-                    xref["DIRECT_XREFS"].append(direct)
-
-                    match = re.search(r"(%s-[0-9]+)" % accession, extra[1])
-                    if match:
-                        isoform = match.group(1)
-                        self.add_to_direct_xrefs(
-                            {
-                                "stable_id": stable_id,
-                                "ensembl_type": "translation",
-                                "accession": isoform,
-                                "label": isoform,
-                                "source_id": source_ids["isoform_source_id"],
-                                "linkage": "DIRECT",
-                                "species_id": species_id,
-                            },
-                            dbi,
-                        )
-                        counts["num_isoform"] += 1
-
-                # Create dependent xref structure & store it
-                if dependent_sources.get(source):
-                    dependent = {}
-
-                    dependent["SOURCE_NAME"] = source
-                    dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"]
-                    dependent["SOURCE_ID"] = dependent_sources[source]
-                    dependent["ACCESSION"] = acc
-
-                    if not seen.get(f"{source}:{acc}"):
+            for dependent_line in entry.get("DR", []):
+                vals = re.split(r";\s*", dependent_line)
+                source = vals[0]
+                dependent_acc = vals[1]
+                extra = vals[2:] if len(vals) > 2 else []
+
+                # Skip external sources obtained through other files
+                if self.DEPENDENTS_PATTERN.search(source):
+                    continue
+
+                # If mapped to Ensembl, add as direct xref
+                if source == "Ensembl":
+                    stable_id = self.STABLE_ID_PATTERN.sub("", extra[0])
+
+                    direct = {}
+                    direct["STABLE_ID"] = stable_id
+                    direct["ENSEMBL_TYPE"] = "Translation"
+                    direct["LINKAGE_TYPE"] = "DIRECT"
+                    if xref["SOURCE_ID"] == source_ids["sp_source_id"]:
+                        direct["SOURCE_ID"] = source_ids["sp_direct_source_id"]
+                        counts["num_direct_sp"] += 1
+                    else:
+                        direct["SOURCE_ID"] = source_ids["sptr_direct_source_id"]
+                        counts["num_direct_sptr"] += 1
+                    xref["DIRECT_XREFS"].append(direct)
+
+                    match = re.search(r"(%s-[0-9]+)" % accession, extra[1])
+                    if match:
+                        isoform = match.group(1)
+
+                        xref_id = self.add_xref(
+                            {
+                                "accession": isoform,
+                                "label": isoform,
+                                "source_id": source_ids["isoform_source_id"],
+                                "species_id": species_id,
+                                "info_type": "DIRECT",
+                            },
+                            dbi,
+                        )
+                        self.add_direct_xref(xref_id, stable_id, "translation", "DIRECT", dbi)
+                        counts["num_isoform"] += 1
+
+                # Create dependent xref structure & store it
+                if dependent_sources.get(source):
+                    # Only add dependent accession once per record
+                    if not seen.get(f"{source}:{dependent_acc}"):
+                        dependent = {
+                            "SOURCE_NAME": source,
+                            "LINKAGE_SOURCE_ID": xref["SOURCE_ID"],
+                            "SOURCE_ID": dependent_sources[source],
+                            "ACCESSION": dependent_acc,
+                        }
+
+                        xref["DEPENDENT_XREFS"].append(dependent)
+                        dependent_xrefs_counts[source] = dependent_xrefs_counts.get(source, 0) + 1
+                        seen[f"{source}:{dependent_acc}"] = True
+
+                # For EMBL source, add protein_id as dependent xref
+                if source == "EMBL":
+                    protein_id = extra[0]
+                    if protein_id != "-" and not seen.get(f"{source}:{protein_id}"):
+                        protein_id_acc = self.PROTEIN_ID_PATTERN.search(protein_id).group(1)
+                        dependent = {
+                            "SOURCE_NAME": source,
+                            "SOURCE_ID": dependent_sources["protein_id"],
+                            "LINKAGE_SOURCE_ID": xref["SOURCE_ID"],
+                            "LABEL": protein_id,
+                            "ACCESSION": protein_id_acc,
+                        }
+                        xref["DEPENDENT_XREFS"].append(dependent)
-                            dependent_xrefs_counts[source] = (
-                                dependent_xrefs_counts.get(source, 0)
+ 1 - ) - seen[f"{source}:{acc}"] = 1 - - if re.search(r"EMBL", dep) and not re.search(r"ChEMBL", dep): - protein_id = extra[0] - if protein_id != "-" and not seen.get( - f"{source}:{protein_id}" - ): - dependent = {} - - dependent["SOURCE_NAME"] = source - dependent["SOURCE_ID"] = dependent_sources["protein_id"] - dependent["LINKAGE_SOURCE_ID"] = xref["SOURCE_ID"] - dependent["LABEL"] = protein_id - dependent["ACCESSION"] = re.search( - r"([^.]+)\.([^.]+)", protein_id - ).group(1) - xref["DEPENDENT_XREFS"].append(dependent) - dependent_xrefs_counts[source] = ( - dependent_xrefs_counts.get(source, 0) + 1 - ) - seen[f"{source}:{protein_id}"] = 1 + dependent_xrefs_counts["protein_id"] = dependent_xrefs_counts.get("protein_id", 0) + 1 + seen[f"{source}:{protein_id}"] = True xrefs.append(xref) @@ -416,26 +348,101 @@ def create_xrefs(self, source_ids: Dict[str, int], species_id: int, file: str, d count = 0 xrefs.clear() - if len(xrefs) > 0: + if xrefs: self.upload_xref_object_graphs(xrefs, dbi) - result_message = f'Read {counts["num_sp"]} SwissProt xrefs, {counts["num_sptr"]} SPTrEMBL xrefs with protein evidence codes 1-2, and {counts["num_sptr_non_display"]} SPTrEMBL xrefs with protein evidence codes > 2 from {file}\n' - result_message += f'Added {counts["num_direct_sp"]} direct SwissProt xrefs and {counts["num_direct_sptr"]} direct SPTrEMBL xrefs\n' - result_message += f'Added {counts["num_isoform"]} direct isoform xrefs\n' - result_message += f"Skipped {ensembl_derived_protein_count} ensembl annotations as Gene names\n" - - result_message += f"Added the following dependent xrefs:\n" + result_message = ( + f'Read {counts["num_sp"]} SwissProt xrefs, {counts["num_sptr"]} SPTrEMBL xrefs with protein evidence codes 1-2, ' + f'and {counts["num_sptr_non_display"]} SPTrEMBL xrefs with protein evidence codes > 2 from {xref_file}\n' + f'Added {counts["num_direct_sp"]} direct SwissProt xrefs and {counts["num_direct_sptr"]} direct SPTrEMBL xrefs\n' + f'Added {counts["num_isoform"]} direct isoform xrefs\n' + f'Skipped {ensembl_derived_protein_count} ensembl annotations as Gene names\n' + f'Added the following dependent xrefs:\n' + ) for xref_source, xref_count in dependent_xrefs_counts.items(): result_message += f"\t{xref_source}\t{xref_count}\n" return result_message + def extract_entry_fields(self, section: str) -> Dict[str, List[str]]: + entry_dict = {} + in_sq_section = False + + for line in section: + line = line.strip() + if not line: + continue + + line_key = line[:2] + clean_line = line[2:].strip() + + if line_key == "SQ": + in_sq_section = True + elif in_sq_section: + line_key = "SQ" + clean_line = line + + entry_dict.setdefault(line_key, []).append(clean_line) + + return entry_dict + + def extract_description(self, full_description: str) -> Tuple[str, str]: + descriptions = [] + ec_number = None + description = "" + + description_lines = full_description.split(";") + for line in description_lines: + if not line.strip(): + continue + + match = self.DESCRIPTION_PATTERN.search(line) + if match: + descriptions.append(match.group(2)) + + # Get the EC number, if present + match = self.EC_PATTERN.search(line) + if match: + ec_number = match.group(1) + ec_number = self.ECO_PATTERN.sub("", ec_number).strip() + + if descriptions: + description = "; ".join(descriptions) + description = self.ECO_PATTERN.sub("", description).strip() + + return description, ec_number + + def extract_gene_name(self, full_gene_names: str) -> Tuple[str, List[str]]: + name_found = False + gene_name = None + synonyms_list = [] + 
+ gene_name_lines = full_gene_names.split(";") + for line in gene_name_lines: + if not line.strip(): + continue + + match = self.GENE_NAME_PATTERN.search(line) + if match and not name_found: + gene_name = match.group(1) + gene_name = self.ECO_PATTERN.sub("", gene_name).strip() + name_found = True + + match = self.SYNONYMS_PATTERN.search(line) + if match: + synonyms = match.group(1) + synonyms = self.ECO_PATTERN.sub("", synonyms).strip() + synonyms = self.SYNONYMS_COMMA_PATTERN.sub(",", synonyms) + synonyms_list = synonyms.split(",") + + return gene_name, synonyms_list + def get_hgnc_descriptions(self, hgnc_file: str) -> Dict[str, str]: descriptions = {} # Make sure the file is utf8 hgnc_file = codecs.encode(hgnc_file, "utf-8").decode("utf-8") - hgnc_file = re.sub(r'"', '', hgnc_file) + hgnc_file = re.sub(r'"', "", hgnc_file) hgnc_io = self.get_filehandle(hgnc_file) csv_reader = csv.DictReader(hgnc_io, delimiter="\t") @@ -449,4 +456,4 @@ def get_hgnc_descriptions(self, hgnc_file: str) -> Dict[str, str]: hgnc_io.close() - return descriptions \ No newline at end of file + return descriptions diff --git a/src/python/ensembl/production/xrefs/parsers/VGNCParser.py b/src/python/ensembl/production/xrefs/parsers/VGNCParser.py index 21cb13d58..ffab8f1c2 100644 --- a/src/python/ensembl/production/xrefs/parsers/VGNCParser.py +++ b/src/python/ensembl/production/xrefs/parsers/VGNCParser.py @@ -14,18 +14,51 @@ """Parser module for VGNC source (uses HGNC Parser as parent).""" -from ensembl.production.xrefs.parsers.HGNCParser import * +import csv +from typing import Dict, Any, Tuple +from sqlalchemy.engine import Connection +from ensembl.production.xrefs.parsers.HGNCParser import HGNCParser class VGNCParser(HGNCParser): def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") + + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") + + # Open the VGNC file + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"VGNC file is empty") + file_io.seek(0) + + csv_reader = csv.DictReader(file_io, delimiter="\t") + + # Check if header has required columns + required_columns = [ + "taxon_id", + "ensembl_gene_id", + "vgnc_id", + "symbol", + "name", + "alias_symbol", + "prev_symbol", + ] + if not set(required_columns).issubset(set(csv_reader.fieldnames)): + raise ValueError(f"Can't find required columns in VGNC file '{xref_file}'") + + count, syn_count = self.process_lines(csv_reader, source_id, species_id, xref_dbi) + + result_message = f"Loaded a total of {count} VGNC xrefs and added {syn_count} synonyms" - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + return 0, result_message + + def process_lines(self, csv_reader: csv.DictReader, source_id: int, species_id: int, xref_dbi: Connection) -> Tuple[int, int]: + count, syn_count = 0, 0 # Create a hash of all valid taxon_ids for this species species_id_to_tax = self.species_id_to_taxonomy(xref_dbi) @@ -34,46 +67,30 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: tax_ids = species_id_to_tax[species_id] tax_to_species_id = {tax_id: species_id for tax_id in tax_ids} - # Open the vgnc file - file_io = 
self.get_filehandle(file) - csv_reader = csv.DictReader(file_io, delimiter="\t") - - # Check if header has required columns - required_columns = [ - "taxon_id", - "ensembl_gene_id", - "vgnc_id", - "symbol", - "name", - "alias_symbol", - "prev_symbol", - ] - if not set(required_columns).issubset(set(csv_reader.fieldnames)): - raise IOError(f"Can't find required columns in VGNC file '{file}'") - # Read lines - count = 0 for line in csv_reader: + tax_id = int(line["taxon_id"]) # Skip data for other species - if not tax_to_species_id.get(line["taxon_id"]): + if not tax_to_species_id.get(tax_id): continue - # Add ensembl direct xref + # Add Ensembl direct xref if line["ensembl_gene_id"]: - self.add_to_direct_xrefs( + xref_id = self.add_xref( { - "stable_id": line["ensembl_gene_id"], - "ensembl_type": "gene", "accession": line["vgnc_id"], "label": line["symbol"], "description": line["name"], "source_id": source_id, "species_id": species_id, + "info_type": "DIRECT", }, xref_dbi, ) - - self.add_synonyms_for_hgnc( + self.add_direct_xref(xref_id, line["ensembl_gene_id"], "gene", "", xref_dbi) + + # Add synonyms + syn_count += self.add_synonyms_for_hgnc( { "source_id": source_id, "name": line["vgnc_id"], @@ -85,9 +102,5 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: ) count += 1 - - file_io.close() - - result_message = f"Loaded a total of {count} VGNC xrefs" - - return 0, result_message + + return count, syn_count \ No newline at end of file diff --git a/src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py b/src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py index 38c8ccbda..91d37397b 100644 --- a/src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py +++ b/src/python/ensembl/production/xrefs/parsers/XenopusJamboreeParser.py @@ -14,63 +14,67 @@ """Parser module for Xenbase source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import csv +import re +from typing import Any, Dict, Tuple +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class XenopusJamboreeParser(BaseParser): + DESC_PROVENANCE_PATTERN = re.compile(r"\s*\[.*\]", re.IGNORECASE | re.DOTALL) + DESC_LABEL_PATTERN = re.compile(r",\s+\d+\s+of\s+\d+", re.IGNORECASE | re.DOTALL) + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") count = 0 - file_io = self.get_filehandle(file) - csv_reader = csv.reader(file_io, delimiter="\t") - - # Read lines - for line in csv_reader: - accession = line[0] - label = line[1] - desc = line[2] - stable_id = line[3] - - # If there is a description, trim it a bit - if desc: - desc = self.parse_description(desc) - - if label == "unnamed": - label = accession - - self.add_to_direct_xrefs( - { - "stable_id": stable_id, - "ensembl_type": "gene", - "accession": accession, - "label": label, - "description": desc, - "source_id": source_id, - "species_id": species_id, - }, - xref_dbi, - ) - count += 1 - - file_io.close() - - result_message = f"{count} XenopusJamboreeParser xrefs succesfully parsed" - + with 
self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"XenopusJamboree file is empty") + file_io.seek(0) + + csv_reader = csv.reader(file_io, delimiter="\t") + + # Read lines + for line in csv_reader: + accession, label, desc, stable_id = line[:4] + + # If there is a description, trim it a bit + if desc: + desc = self.parse_description(desc) + + if label == "unnamed": + label = accession + + xref_id = self.add_xref( + { + "accession": accession, + "label": label, + "description": desc, + "source_id": source_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + self.add_direct_xref(xref_id, stable_id, "gene", "", xref_dbi) + count += 1 + + result_message = f"{count} XenopusJamboree xrefs successfully parsed" return 0, result_message def parse_description(self, description: str) -> str: # Remove some provenance information encoded in the description - description = re.sub(r"\s*\[.*\]", "", description) + description = self.DESC_PROVENANCE_PATTERN.sub("", description) # Remove labels of type 5 of 14 from the description - description = re.sub(r",\s+\d+\s+of\s+\d+", "", description) + description = self.DESC_LABEL_PATTERN.sub("", description) return description diff --git a/src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py b/src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py index 4e703788a..670a03dcc 100644 --- a/src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py +++ b/src/python/ensembl/production/xrefs/parsers/ZFINDescParser.py @@ -14,49 +14,53 @@ """Parser module for ZFIN Descriptions.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import csv +import re +from typing import Any, Dict, Tuple +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class ZFINDescParser(BaseParser): + WITHDRAWN_PATTERN = re.compile(r"^WITHDRAWN:", re.IGNORECASE | re.DOTALL) + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") count = 0 withdrawn = 0 - file_io = self.get_filehandle(file) - csv_reader = csv.DictReader(file_io, delimiter="\t") - csv_reader.fieldnames = ["zfin", "desc", "label", "extra1", "extra2"] - - # Read lines - for line in csv_reader: - # Skip if WITHDRAWN: this precedes both desc and label - if re.search(r"\A WITHDRAWN:", line["label"]): - withdrawn += 1 - else: - xref_id = self.add_xref( - { - "accession": line["zfin"], - "label": line["label"], - "description": line["desc"], - "source_id": source_id, - "species_id": species_id, - "info_type": "MISC", - }, - xref_dbi, - ) - count += 1 - - file_io.close() - - result_message = ( - f"{count} ZFINDesc xrefs added, {withdrawn} withdrawn entries ignored" - ) + with self.get_filehandle(xref_file) as file_io: + if file_io.read(1) == '': + raise IOError(f"ZFINDesc file is empty") + file_io.seek(0) + + csv_reader = csv.DictReader(file_io, delimiter="\t") + csv_reader.fieldnames = ["zfin", "desc", "label", "extra1", "extra2"] + + # Read lines + for line in csv_reader: + # Skip if WITHDRAWN: this precedes both 
desc and label + if self.WITHDRAWN_PATTERN.search(line["label"]): + withdrawn += 1 + else: + self.add_xref( + { + "accession": line["zfin"], + "label": line["label"], + "description": line["desc"], + "source_id": source_id, + "species_id": species_id, + "info_type": "MISC", + }, + xref_dbi, + ) + count += 1 + result_message = f"{count} ZFINDesc xrefs added, {withdrawn} withdrawn entries ignored" return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/ZFINParser.py b/src/python/ensembl/production/xrefs/parsers/ZFINParser.py index 8734d62ca..2792af8ff 100644 --- a/src/python/ensembl/production/xrefs/parsers/ZFINParser.py +++ b/src/python/ensembl/production/xrefs/parsers/ZFINParser.py @@ -14,127 +14,134 @@ """Parser module for ZFIN source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import os +import csv +import re +import unicodedata +from typing import Dict, Any, Tuple +from sqlalchemy import select +from ensembl.xrefs.xref_update_db_model import ( + Source as SourceUORM, + Xref as XrefUORM, +) + +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class ZFINParser(BaseParser): + REFSEQ_ACC_PATTERN = re.compile(r"^X[PMR]_") + def run(self, args: Dict[str, Any]) -> Tuple[int, str]: - source_id = args["source_id"] - species_id = args["species_id"] - file = args["file"] - xref_dbi = args["xref_dbi"] + source_id = args.get("source_id") + species_id = args.get("species_id") + xref_file = args.get("file") + xref_dbi = args.get("xref_dbi") - if not source_id or not species_id or not file: - raise AttributeError("Need to pass source_id, species_id and file as pairs") + if not source_id or not species_id or not xref_file: + raise AttributeError("Missing required arguments: source_id, species_id, and file") # Get the ZFIN source ids - direct_src_id = self.get_source_id_for_source_name( - "ZFIN_ID", xref_dbi, "direct" - ) - dependent_src_id = self.get_source_id_for_source_name( - "ZFIN_ID", xref_dbi, "uniprot/refseq" - ) - description_src_id = self.get_source_id_for_source_name( - "ZFIN_ID", xref_dbi, "description_only" - ) + direct_src_id = self.get_source_id_for_source_name("ZFIN_ID", xref_dbi, "direct") + dependent_src_id = self.get_source_id_for_source_name("ZFIN_ID", xref_dbi, "uniprot/refseq") + description_src_id = self.get_source_id_for_source_name("ZFIN_ID", xref_dbi, "description_only") # Get the ZFIN descriptions - description = {} - query = select(XrefUORM.accession, XrefUORM.description).where( - XrefUORM.source_id == description_src_id - ) + descriptions = {} + query = select(XrefUORM.accession, XrefUORM.description).where(XrefUORM.source_id == description_src_id) for row in xref_dbi.execute(query).mappings().all(): if row.description: - description[row.accession] = row.description + descriptions[row.accession] = row.description # Get the Uniprot and RefSeq accessions swiss = self.get_valid_codes("uniprot/swissprot", species_id, xref_dbi) refseq = self.get_valid_codes("refseq", species_id, xref_dbi) - file_dir = os.path.dirname(file) + file_dir = os.path.dirname(xref_file) counts = {"direct": 0, "uniprot": 0, "refseq": 0, "synonyms": 0, "mismatch": 0} # Process ZFIN to ensEMBL mappings zfin = {} - zfin_io = self.get_filehandle(os.path.join(file_dir, "ensembl_1_to_1.txt")) - zfin_csv_reader = csv.DictReader(zfin_io, delimiter="\t", strict=True) - zfin_csv_reader.fieldnames = ["zfin", "so", "label", "ensembl_id"] - for line in zfin_csv_reader: - self.add_to_direct_xrefs( - { - "stable_id": line["ensembl_id"], - "ensembl_type": 
"gene", - "accession": line["zfin"], - "label": line["label"], - "description": description.get(line["zfin"]), - "source_id": direct_src_id, - "species_id": species_id, - }, - xref_dbi, - ) - - zfin[line["zfin"]] = 1 - counts["direct"] += 1 - - zfin_io.close() + with self.get_filehandle(os.path.join(file_dir, "ensembl_1_to_1.txt")) as zfin_io: + if zfin_io.read(1) == '': + raise IOError(f"ZFIN Ensembl file is empty") + zfin_io.seek(0) + + zfin_csv_reader = csv.DictReader(zfin_io, delimiter="\t", strict=True) + zfin_csv_reader.fieldnames = ["zfin", "so", "label", "ensembl_id"] + for line in zfin_csv_reader: + xref_id = self.add_xref( + { + "accession": line["zfin"], + "label": line["label"], + "description": descriptions.get(line["zfin"]), + "source_id": direct_src_id, + "species_id": species_id, + "info_type": "DIRECT", + }, + xref_dbi, + ) + self.add_direct_xref(xref_id, line["ensembl_id"], "gene", "", xref_dbi) - # Process ZFIN to Uniprot mappings - swissprot_io = self.get_filehandle(os.path.join(file_dir, "uniprot.txt")) - swissprot_csv_reader = csv.DictReader(swissprot_io, delimiter="\t", strict=True) - swissprot_csv_reader.fieldnames = ["zfin", "so", "label", "acc"] - for line in swissprot_csv_reader: - if swiss.get(line["acc"]) and not zfin.get(line["zfin"]): - for xref_id in swiss[line["acc"]]: - self.add_dependent_xref( - { - "master_xref_id": xref_id, - "accession": line["zfin"], - "label": line["label"], - "description": description.get(line["zfin"]), - "source_id": dependent_src_id, - "species_id": species_id, - }, - xref_dbi, - ) - counts["uniprot"] += 1 - else: - counts["mismatch"] += 1 + zfin[line["zfin"]] = True + counts["direct"] += 1 - swissprot_io.close() + # Process ZFIN to Uniprot mappings + with self.get_filehandle(os.path.join(file_dir, "uniprot.txt")) as swissprot_io: + if swissprot_io.read(1) == '': + raise IOError(f"ZFIN Uniprot file is empty") + swissprot_io.seek(0) + + swissprot_csv_reader = csv.DictReader(swissprot_io, delimiter="\t", strict=True) + swissprot_csv_reader.fieldnames = ["zfin", "so", "label", "acc"] + for line in swissprot_csv_reader: + if swiss.get(line["acc"]) and not zfin.get(line["zfin"]): + for xref_id in swiss[line["acc"]]: + self.add_dependent_xref( + { + "master_xref_id": xref_id, + "accession": line["zfin"], + "label": line["label"], + "description": descriptions.get(line["zfin"]), + "source_id": dependent_src_id, + "species_id": species_id, + }, + xref_dbi, + ) + counts["uniprot"] += 1 + else: + counts["mismatch"] += 1 # Process ZFIN to RefSeq mappings - refseq_io = self.get_filehandle(os.path.join(file_dir, "refseq.txt")) - refseq_csv_reader = csv.DictReader(refseq_io, delimiter="\t", strict=True) - refseq_csv_reader.fieldnames = ["zfin", "so", "label", "acc"] - for line in refseq_csv_reader: - # Ignore mappings to predicted RefSeq - if ( - re.search(r"^XP_", line["acc"]) - or re.search(r"^XM_", line["acc"]) - or re.search(r"^XR_", line["acc"]) - ): - continue - - if refseq.get(line["acc"]) and not zfin.get(line["zfin"]): - for xref_id in refseq[line["acc"]]: - self.add_dependent_xref( - { - "master_xref_id": xref_id, - "accession": line["zfin"], - "label": line["label"], - "description": description.get(line["zfin"]), - "source_id": source_id, - "species_id": species_id, - }, - xref_dbi, - ) - counts["refseq"] += 1 - else: - counts["mismatch"] += 1 - - refseq_io.close() - - # Get the added ZFINs added + with self.get_filehandle(os.path.join(file_dir, "refseq.txt")) as refseq_io: + if refseq_io.read(1) == '': + raise IOError(f"ZFIN 
Refseq file is empty") + refseq_io.seek(0) + + refseq_csv_reader = csv.DictReader(refseq_io, delimiter="\t", strict=True) + refseq_csv_reader.fieldnames = ["zfin", "so", "label", "acc"] + for line in refseq_csv_reader: + # Ignore mappings to predicted RefSeq + if self.REFSEQ_ACC_PATTERN.search(line["acc"]): + continue + + if refseq.get(line["acc"]) and not zfin.get(line["zfin"]): + for xref_id in refseq[line["acc"]]: + self.add_dependent_xref( + { + "master_xref_id": xref_id, + "accession": line["zfin"], + "label": line["label"], + "description": descriptions.get(line["zfin"]), + "source_id": dependent_src_id, + "species_id": species_id, + }, + xref_dbi, + ) + counts["refseq"] += 1 + else: + counts["mismatch"] += 1 + + # Get the added ZFINs zfin = self.get_valid_codes("zfin", species_id, xref_dbi) sources = [] @@ -143,27 +150,31 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: sources.append(row[0]) # Process the synonyms - aliases_io = self.get_filehandle(os.path.join(file_dir, "aliases.txt")) - aliases_csv_reader = csv.DictReader(aliases_io, delimiter="\t", strict=True) - aliases_csv_reader.fieldnames = ["acc", "cur_name", "cur_symbol", "syn", "so"] - for line in aliases_csv_reader: - if zfin.get(line["acc"]): - synonym = ( - unicodedata.normalize("NFKD", line["syn"]) - .encode("ascii", "namereplace") - .decode("ascii") - ) - self.add_to_syn_for_mult_sources( - line["acc"], sources, synonym, species_id, xref_dbi - ) - counts["synonyms"] += 1 - - aliases_io.close() - - result_message = f"{counts['direct']} direct ZFIN xrefs added and\n" - result_message += f"\t{counts['uniprot']} dependent xrefs from UniProt added\n" - result_message += f"\t{counts['refseq']} dependent xrefs from RefSeq added\n" - result_message += f"\t{counts['mismatch']} dependents ignored\n" - result_message += f"\t{counts['synonyms']} synonyms loaded" + with self.get_filehandle(os.path.join(file_dir, "aliases.txt")) as aliases_io: + if aliases_io.read(1) == '': + raise IOError(f"ZFIN Aliases file is empty") + aliases_io.seek(0) + + aliases_csv_reader = csv.DictReader(aliases_io, delimiter="\t", strict=True) + aliases_csv_reader.fieldnames = ["acc", "cur_name", "cur_symbol", "syn", "so"] + for line in aliases_csv_reader: + if zfin.get(line["acc"]): + synonym = ( + unicodedata.normalize("NFKD", line["syn"]) + .encode("ascii", "namereplace") + .decode("ascii") + ) + self.add_to_syn_for_mult_sources( + line["acc"], sources, synonym, species_id, xref_dbi + ) + counts["synonyms"] += 1 + + result_message = ( + f"{counts['direct']} direct ZFIN xrefs added and\n" + f"\t{counts['uniprot']} dependent xrefs from UniProt added\n" + f"\t{counts['refseq']} dependent xrefs from RefSeq added\n" + f"\t{counts['mismatch']} dependents ignored\n" + f"\t{counts['synonyms']} synonyms loaded" + ) return 0, result_message diff --git a/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py b/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py index dcba51ccb..cc90ea85c 100644 --- a/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py +++ b/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py @@ -14,19 +14,26 @@ """Parser module for miRBase source.""" -from ensembl.production.xrefs.parsers.BaseParser import * +import re +from typing import Any, Dict, List, Tuple +from ensembl.production.xrefs.parsers.BaseParser import BaseParser class miRBaseParser(BaseParser): + NAME_PATTERN = re.compile(r"^ID\s+(\S+)\s+", re.MULTILINE) + ACCESSION_PATTERN = re.compile(r"^AC\s+(\S+);\s+", re.MULTILINE) + 
DESCRIPTION_PATTERN = re.compile(r"^DE\s+(.*)", re.MULTILINE)
+    SPECIES_NAME_PATTERN = re.compile(r"(.+?)\s+stem(-|\s)loop")
+
    def run(self, args: Dict[str, Any]) -> Tuple[int, str]:
-        source_id = args["source_id"]
-        species_id = args["species_id"]
-        species_name = args["species_name"]
-        file = args["file"]
-        xref_dbi = args["xref_dbi"]
+        source_id = args.get("source_id")
+        species_id = args.get("species_id")
+        species_name = args.get("species_name")
+        file = args.get("file")
+        xref_dbi = args.get("xref_dbi")
        if not source_id or not species_id or not file:
-            raise AttributeError("Need to pass source_id, species_id and file as pairs")
+            raise AttributeError("Missing required arguments: source_id, species_id, and file")
        # Get the species name(s)
        species_to_names = self.species_id_to_names(xref_dbi)
@@ -44,69 +51,61 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]:
        self.upload_xref_object_graphs(xrefs, xref_dbi)
-        result_message = "Read %d xrefs from %s" % (len(xrefs), file)
-
+        result_message = f"Read {len(xrefs)} xrefs from {file}"
        return 0, result_message
    def create_xrefs(self, source_id: int, file: str, species_id: int, name_to_species_id: Dict[str, int]) -> List[Dict[str, Any]]:
        xrefs = []
-        # Read mirbase file
+        # Read miRBase file
        for section in self.get_file_sections(file, "//\n"):
-            if len(section) == 1:
-                continue
-
            entry = "".join(section)
            if not entry:
                continue
-            xref = {}
-
-            (header, sequence) = re.split(r"\nSQ", entry, 2)
+            header, sequence = re.split(r"\nSQ", entry, 1)
            species = None
            # Extract sequence
            if sequence:
-                seq_lines = sequence.split("\n")
-                seq_lines.pop(0)
-
-                sequence = "".join(seq_lines)
-                sequence = sequence.upper()
-                sequence = re.sub("U", "T", sequence)
-                sequence = re.sub(r"[\d+,\s+]", "", sequence)
+                seq_lines = sequence.split("\n")[1:]  # Remove newlines and drop the information line
+                sequence = "".join(seq_lines).upper()  # Join into a single string and convert to uppercase
+                sequence = re.sub("U", "T", sequence)  # Replace Us with Ts
+                sequence = re.sub(r"[\d+\s,]", "", sequence)  # Remove digits, spaces, and commas
            # Extract name, accession, and description
-            name = re.search(r"^ID\s+(\S+)\s+", header, flags=re.MULTILINE).group(1)
-            accession = re.search(r"^AC\s+(\S+);\s+", header, flags=re.MULTILINE).group(
-                1
-            )
-            description = re.search(
-                r"^DE\s+(.+)\s+stem(-|\s)loop", header, flags=re.MULTILINE
-            ).group(1)
-
-            # Format description and extract species name
-            if description:
-                description_parts = re.split(r"\s+", description)
-                description_parts.pop()
-                species = " ".join(description_parts)
-                species = species.lower()
-                species = re.sub(" ", "_", species)
+            name_match = self.NAME_PATTERN.search(header)
+            accession_match = self.ACCESSION_PATTERN.search(header)
+            description_match = self.DESCRIPTION_PATTERN.search(header)
+
+            if not (name_match and accession_match and description_match):
+                continue
+
+            name = name_match.group(1)
+            accession = accession_match.group(1)
+            description = description_match.group(1)
+
+            # Extract species name from description
+            species_name_match = self.SPECIES_NAME_PATTERN.search(description)
+            species = species_name_match.group(1)
+            species = "_".join(species.split()[:-1]).lower()
            # If no species match, skip to next record
            species_id_check = name_to_species_id.get(species)
            if not species_id_check:
                continue
-            if species_id and species_id == species_id_check:
+            if species_id == species_id_check:
                xref = {
                    "SEQUENCE_TYPE": "dna",
                    "STATUS": "experimental",
                    "SOURCE_ID": source_id,
                    "ACCESSION": accession,
                    "LABEL": name,
-                    "DESCRIPTION":
name, + "DESCRIPTION": description, "SEQUENCE": sequence, "SPECIES_ID": species_id, + "INFO_TYPE": "SEQUENCE_MATCH", } xrefs.append(xref) diff --git a/src/python/test/xrefs/__init__.py b/src/python/test/xrefs/__init__.py new file mode 100644 index 000000000..b82a66b8a --- /dev/null +++ b/src/python/test/xrefs/__init__.py @@ -0,0 +1,15 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Xref tests.""" diff --git a/src/python/test/xrefs/conftest.py b/src/python/test/xrefs/conftest.py new file mode 100644 index 000000000..36b690013 --- /dev/null +++ b/src/python/test/xrefs/conftest.py @@ -0,0 +1,135 @@ +import pytest +import os +import io +import re + +from datetime import datetime +from unittest.mock import MagicMock +from typing import Any, Generator, Callable + +from ensembl.utils.database import UnitTestDB, DBConnection +from ensembl.xrefs.xref_update_db_model import Base +from ensembl.production.xrefs.parsers.BaseParser import BaseParser + +# Fixture to set up a test database +@pytest.fixture(scope="module") +def test_db() -> Generator[None, None, None]: + # Create a unique database name using the current user and timestamp + user = os.environ.get("USER", "testuser") + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + db_name = f"{user}_test_xref_{timestamp}" + mysql_url = f"mysql+pymysql://ensadmin:ensembl@mysql-ens-core-prod-1.ebi.ac.uk:4524/{db_name}" + + # Create all tables defined in the Base metadata + with UnitTestDB(mysql_url, metadata=Base.metadata, name=db_name) as test_db: + yield test_db + +# Fixture to connect to the test database and close connection when done +@pytest.fixture +def mock_xref_dbi(test_db: UnitTestDB) -> Generator[Any, None, None]: + conn = test_db.dbc.connect() + yield conn + conn.close() + +# Common test for missing source_id +@pytest.fixture +def test_no_source_id() -> Callable[[BaseParser, int], None]: + def _test_no_source_id(parser_instance: BaseParser, species_id: int = 9606) -> None: + with pytest.raises( + AttributeError, match=r"Missing required arguments: source_id(,| and) species_id(, and file)?" + ): + parser_instance.run( + { + "species_id": species_id, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + return _test_no_source_id + +# Common test for missing species_id +@pytest.fixture +def test_no_species_id() -> Callable[[BaseParser, int], None]: + def _test_no_species_id(parser_instance: BaseParser, source_id: int = 1) -> None: + with pytest.raises( + AttributeError, match=r"Missing required arguments: source_id(,| and) species_id(, and file)?" 
+ ): + parser_instance.run( + { + "source_id": source_id, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + return _test_no_species_id + +# Common test for missing file +@pytest.fixture +def test_no_file() -> Callable[[BaseParser, int, int], None]: + def _test_no_file(parser_instance: BaseParser, source_id: int = 1, species_id: int = 9606) -> None: + with pytest.raises( + AttributeError, match="Missing required arguments: source_id, species_id, and file" + ): + parser_instance.run( + { + "source_id": source_id, + "species_id": species_id, + "xref_dbi": MagicMock(), + } + ) + return _test_no_file + +# Common test for file not found +@pytest.fixture +def test_file_not_found() -> Callable[[BaseParser, int, int], None]: + def _test_file_not_found(parser_instance: BaseParser, source_id: int = 1, species_id: int = 9606) -> None: + with pytest.raises(FileNotFoundError, match=f"Could not find either"): + parser_instance.run( + { + "source_id": source_id, + "species_id": species_id, + "file": "flatfiles/non_existent_file.txt", + "xref_dbi": MagicMock(), + } + ) + return _test_file_not_found + +# Common test for empty file +@pytest.fixture +def test_empty_file() -> Callable[[BaseParser, str, int, int], None]: + def _test_empty_file(parser_instance: BaseParser, source_name: str, source_id: int = 1, species_id: int = 9606) -> None: + mock_file = io.StringIO("") + parser_instance.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(IOError, match=f"{source_name} file is empty"): + parser_instance.run( + { + "source_id": source_id, + "species_id": species_id, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + return _test_empty_file + +@pytest.fixture +def test_missing_required_source_id() -> Callable[[BaseParser, DBConnection, str, int, int, str], None]: + def _test_missing_required_source_id(parser_instance: BaseParser, mock_dbi: DBConnection, source_name: str, source_id: int = 1, species_id: int = 9606, priority_desc: str = None) -> None: + mock_file = io.StringIO("test file") + parser_instance.get_filehandle = MagicMock(return_value=mock_file) + + if priority_desc is not None: + source_name = f"{source_name} ({priority_desc})" + + with pytest.raises( + KeyError, match=re.escape(f"No source_id for source_name={source_name}") + ): + parser_instance.run( + { + "source_id": source_id, + "species_id": species_id, + "file": "dummy_file.txt", + "xref_dbi": mock_dbi, + } + ) + return _test_missing_required_source_id \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/flatfiles/dbass3.txt b/src/python/test/xrefs/parsers/flatfiles/dbass3.txt new file mode 100644 index 000000000..b03169479 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/dbass3.txt @@ -0,0 +1,9 @@ +Id,GeneSymbol,GeneFullName,EnsemblReference,Phenotype,OmimReference,Mutation,Location,AuthenticAberrantDistance,ReadingFrameChange,NucleotideSequence,InTerminalExon,Comment,MutationCoordinates,AberrantSpliceSiteCoordinates,MaximumEntropyModelAuthentic,MaximumEntropyModelCryptic,FirstOrderMarkovModelAuthentic,FirstOrderMarkovModelCryptic,WeightMatrixModelAuthentic,WeightMatrixModelCryptic,PubMedReference,ReferenceText +1,GNAS1,GNAS complex locus,ENSG00000087460,Hereditary 
osteodystrophy,103580,IVS7-1G>A,Exon,+1,+1,AAGCAGGCTGACTATGTGCCGAGCGATCAGgtgtgcaaaacccctccccaccagaggactctgagccctctttccaaactactccagacctttgctttagattggcaattattactgtttcggttggctttggtgagatccattgacctcaattttgtttca(g>a)G/ACCTGCTTCGCTGCCGTGTCCTGACTTCTGGAATCTTTGAGACCAAGTTCCAGGTGGACAAAGTCAACTTCCAgtaagccaactgt,False,,chr20:58909349,chr20:58909350/58909351,10.35,4.02,9.95,4.55,8.06,4.73,12624854,"Rickard & Wilson. (2003) Analysis of GNAS1 and overlapping transcripts identifies the parental origin of mutations in patients with sporadic Albright hereditary osteodystrophy and reveals a model system in which to observe the effects of splicing mutations on translated and untranslated messenger RNA. Am. J. Hum. Genet., 72, 961-974." +2,LDLR (LDLT),low density lipoprotein receptor,ENSG00000130164,Familial hypercholesterolemia,"143890, 144010",IVS9-30GTGCTGATGdelinsCGGCT,Exon,+54,0,TGGCCAGCAATAGAATCTACTGGTCTGACCTGTCCCAGAGAATGATCTGCAGgtgagcgtcgcccctgcctgcagccttggcccgcaggtgaGatgagggctcctg(gcgctgatg>cggct)cccttctctcctcctgcctcagCACCCAGCTTGACAGAGCCCACGGCGTCTCTTCCTATGACACCGTCATCAGCAG/AGACATCCAGGCCCCCGACGGGCTGGCTGTGGACTGGATCCACAGCAACATCTACTGGACCGACTCTGTCCTGGGCACTGTCTCTGTTGCGGATACCAAG/GGCGTGAAGAGGAAAACGTTATTCAGGGAGAACGGCTCCAAGCCAAGGGCCATCGTGGTGGATCCTGTTCATGGgtgcgtatccacgacgctgagg,False,,chr19:11113504,"chr19:11113588/11113589, chr19:11113688/11113689",6.76,2.79,9.02,0.95,11.67,1.22,8872473,"Webb et al. (1996) Genetic variation at a splicing branch point in intron 9 of the low density lipoprotein (LDL)-receptor gene: a rare mutation that disrupts mRNA splicing in a patient with familial hypercholesterolaemia and a common polymorphism. Hum. Mol. Genet., 5, 1325-1331." +3,LDLR(LDLT),low density lipoprotein receptor,ENSG00000130164,Familial hypercholesterolemia,"143890, 144010",IVS1-1G>C,Exon,+10,+1,tgttcctgatcggatgacatttctggttaattctttagttggcaggaaatagacacaggaaacgtggtcagtttctgattctggcgttgagagaccctttctccttttcctctctctca(g>c)TGGGCGACAG/ATGCGAAAGAAACGAGTTCCAGTGCCAAGACGGGAAATGCATCTCCTACAAGTGGGTCTGCGATGGCAGCGCTGAGTGCCAGGATGGCTCTGATGAGTCCCAGGAGACGTGCTgtgagtcccctt,False,,chr19:11100222,chr19:11100232/11100233,9.99,5.28,9.98,4.24,14.12,4.06,10200052,"Maruyama et al. (1998) A novel point mutation in a splice acceptor site of intron 1 of the human low density lipoprotein receptor gene which causes severe hypercholesterolemia: an unexpected absence of exon skipping. Mutations in brief no. 139. Online. Hum. Mutat., 11, 480-481." +4,LDLR/LDLT,low density lipoprotein receptor,ENSG00000130164,Familial hypercholesterolemia,"143890, 144010",IVS7-1G>C,Exon,+17,+2,ctccgtctctagccattggggaagagcctccccaccaagcctctttctctctcttcca(g>c)ATATCGATGAGTGTCAG/GATCCCGACACCTGCAGCCAGCTCTGCGTGAACCTGGAGGGTGGCTACAAGTGCCAGTGTGAGGAAGGCTTCCAGCTGGACCCCCACACGAAGGCCTGCAAGGCTGTGGgtgagcacgggaa,False,,chr19:11111513,chr19:11111530/11111531,13.90,-3.91,13.40,-5.28,16.53,-1.77,10487495,"Yu et al. (1999) Familial hypercholesterolemia. Acceptor splice site (G-->C) mutation in intron 7 of the LDL-R gene: alternate RNA editing causes exon 8 skipping or a premature stop codon in exon 8. LDL-R(Honduras-1) [LDL-R1061(-1) G-->C]. Atherosclerosis, 146, 125-131." +5,COL1A2,"collagen, type I, alpha 2",ENSG00000164692,Ehlers-Danlos syndrome,"120160, 120150",IVS5-1G>C,Exon,+15,0,ttgccctcttttaaataacaacagaaaaatatttacaagtagaatgagaaaatgaactacatgactagtaactaaaaatattttatatatatatataattttttttttttacttctcta(g>c)AACTTTGCTGCTCAG/TATGATGGAAAAGGAGTTGGACTTGGCCCTGGACCAATGgtatgcttatctgt,False,,chr7:94401566,chr7:94401581/94401582,10.62,2.87,11.17,3.61,14.81,4.66,1556139,"Chiodo et al. 
(1992) A base substitution at the splice acceptor site of intron 5 of the COL1A2 gene activates a cryptic splice site within exon 6 and generates abnormal type I procollagen in a patient with Ehlers-Danlos syndrome type VII. J. Biol. Chem., 267, 6361-6369." +6,COL5A1,"collagen, type V, alpha 1",,Ehlers-Danlos syndrome,"120160, 120150",IVS4-2A>G,Exon,+12,0,cctagcttgagtgtcttttgtgagtggcagcttctagggagaatgtttggctctgaggacaagctcgtcttgtggcttggtctggactttcccctgcttcaaggcatggggctgtgtctcccaggtccccatgcgagtgctctgtgagctgctttttcatgagcgtctcttcttttcc(a>g)gGGTGACATCCAG/CAG/CTGCTCTTTGTCTCGGACCACCGGGCAGCTTATGATTACTGTGAGCACTACAGCCCTGACTGTGACACCGCAGTACCTGACACCCCACAGTCGCAGGACCCCAATCCAGATGAATATgtgagttaactctggc,False,,chr9:134727264,"chr9:134727277/134727278, chr9:134727280/134727281",9.58,1.45,9.51,0.45,12.17,1.95,12145749,"Takahara et al. (2002) Order of intron removal influences multiple splice outcomes, including a two-exon skip, in a COL5A1 acceptor-site mutation that results in abnormal pro-alpha1(V) N-propeptides and Ehlers-Danlos syndrome type I. Am. J. Hum. Genet., 71, 451-465." +7,COL5A1,"collagen, type V, alpha 1",ENSG00000130635,Ehlers-Danlos syndrome,"120160, 120150",IVS4-2A>G,Exon,+12,0,cctagcttgagtgtcttttgtgagtggcagcttctagggagaatgtttggctctgaggacaagctcgtcttgtggcttggtctggactttcccctgcttcaaggcatggggctgtgtctcccaggtccccatgcgagtgctctgtgagctgctttttcatgagcgtctcttcttttcc(a>g)gGGTGACATCCAG/CAG/CTGCTCTTTGTCTCGGACCACCGGGCAGCTTATGATTACTGTGAGCACTACAGCCCTGACTGTGACACCGCAGTACCTGACACCCCACAGTCGCAGGACCCCAATCCAGATGAATATgtgagttaactctggc,False,,chr9:134727264,"chr9:134727277/134727278, chr9:134727280/134727281",9.58,1.45,9.51,0.45,12.17,1.95,12145749,"Takahara et al. (2002) Order of intron removal influences multiple splice outcomes, including a two-exon skip, in a COL5A1 acceptor-site mutation that results in abnormal pro-alpha1(V) N-propeptides and Ehlers-Danlos syndrome type I. Am. J. Hum. Genet., 71, 451-465." + diff --git a/src/python/test/xrefs/parsers/flatfiles/dbass5.txt b/src/python/test/xrefs/parsers/flatfiles/dbass5.txt new file mode 100644 index 000000000..d0ce0f459 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/dbass5.txt @@ -0,0 +1,8 @@ +Id,GeneSymbol,GeneFullName,EnsemblReference,Phenotype,OmimReference,Mutation,Location,AuthenticAberrantDistance,ReadingFrameChange,NucleotideSequence,InTerminalExon,Comment,MutationCoordinates,AberrantSpliceSiteCoordinates,MaximumEntropyModelAuthentic,MaximumEntropyModelCryptic,MaximumDependenceDecompositionModelAuthentic,MaximumDependenceDecompositionModelCryptic,FirstOrderMarkovModelAuthentic,FirstOrderMarkovModelCryptic,WeightMatrixModelAuthentic,WeightMatrixModelCryptic,PubMedReference,ReferenceText +1,GNAS1,GNAS complex locus,ENSG00000087460,Hereditary osteodystrophy,103580,IVS7-1G>A,Exon,+1,+1,AAGCAGGCTGACTATGTGCCGAGCGATCAGgtgtgcaaaacccctccccaccagaggactctgagccctctttccaaactactccagacctttgctttagattggcaattattactgtttcggttggctttggtgagatccattgacctcaattttgtttca(g>a)G/ACCTGCTTCGCTGCCGTGTCCTGACTTCTGGAATCTTTGAGACCAAGTTCCAGGTGGACAAAGTCAACTTCCAgtaagccaactgt,False,,chr20:58909349,chr20:58909350/58909351,10.35,4.02,9.95,4.55,8.06,4.73,12624854,"Rickard & Wilson. (2003) Analysis of GNAS1 and overlapping transcripts identifies the parental origin of mutations in patients with sporadic Albright hereditary osteodystrophy and reveals a model system in which to observe the effects of splicing mutations on translated and untranslated messenger RNA. Am. J. Hum. Genet., 72, 961-974." 
+2,LDLR,low density lipoprotein receptor,ENSG00000130164,Familial hypercholesterolemia,"143890, 144010",IVS9-30GTGCTGATGdelinsCGGCT,Exon,+54,0,TGGCCAGCAATAGAATCTACTGGTCTGACCTGTCCCAGAGAATGATCTGCAGgtgagcgtcgcccctgcctgcagccttggcccgcaggtgaGatgagggctcctg(gcgctgatg>cggct)cccttctctcctcctgcctcagCACCCAGCTTGACAGAGCCCACGGCGTCTCTTCCTATGACACCGTCATCAGCAG/AGACATCCAGGCCCCCGACGGGCTGGCTGTGGACTGGATCCACAGCAACATCTACTGGACCGACTCTGTCCTGGGCACTGTCTCTGTTGCGGATACCAAG/GGCGTGAAGAGGAAAACGTTATTCAGGGAGAACGGCTCCAAGCCAAGGGCCATCGTGGTGGATCCTGTTCATGGgtgcgtatccacgacgctgagg,False,,chr19:11113504,"chr19:11113588/11113589, chr19:11113688/11113689",6.76,2.79,9.02,0.95,11.67,1.22,8872473,"Webb et al. (1996) Genetic variation at a splicing branch point in intron 9 of the low density lipoprotein (LDL)-receptor gene: a rare mutation that disrupts mRNA splicing in a patient with familial hypercholesterolaemia and a common polymorphism. Hum. Mol. Genet., 5, 1325-1331." +3,LDLR,low density lipoprotein receptor,ENSG00000130164,Familial hypercholesterolemia,"143890, 144010",IVS1-1G>C,Exon,+10,+1,tgttcctgatcggatgacatttctggttaattctttagttggcaggaaatagacacaggaaacgtggtcagtttctgattctggcgttgagagaccctttctccttttcctctctctca(g>c)TGGGCGACAG/ATGCGAAAGAAACGAGTTCCAGTGCCAAGACGGGAAATGCATCTCCTACAAGTGGGTCTGCGATGGCAGCGCTGAGTGCCAGGATGGCTCTGATGAGTCCCAGGAGACGTGCTgtgagtcccctt,False,,chr19:11100222,chr19:11100232/11100233,9.99,5.28,9.98,4.24,14.12,4.06,10200052,"Maruyama et al. (1998) A novel point mutation in a splice acceptor site of intron 1 of the human low density lipoprotein receptor gene which causes severe hypercholesterolemia: an unexpected absence of exon skipping. Mutations in brief no. 139. Online. Hum. Mutat., 11, 480-481." +4,LDLR,low density lipoprotein receptor,ENSG00000130164,Familial hypercholesterolemia,"143890, 144010",IVS7-1G>C,Exon,+17,+2,ctccgtctctagccattggggaagagcctccccaccaagcctctttctctctcttcca(g>c)ATATCGATGAGTGTCAG/GATCCCGACACCTGCAGCCAGCTCTGCGTGAACCTGGAGGGTGGCTACAAGTGCCAGTGTGAGGAAGGCTTCCAGCTGGACCCCCACACGAAGGCCTGCAAGGCTGTGGgtgagcacgggaa,False,,chr19:11111513,chr19:11111530/11111531,13.90,-3.91,13.40,-5.28,16.53,-1.77,10487495,"Yu et al. (1999) Familial hypercholesterolemia. Acceptor splice site (G-->C) mutation in intron 7 of the LDL-R gene: alternate RNA editing causes exon 8 skipping or a premature stop codon in exon 8. LDL-R(Honduras-1) [LDL-R1061(-1) G-->C]. Atherosclerosis, 146, 125-131." +5,COL1A2,"collagen, type I, alpha 2",ENSG00000164692,Ehlers-Danlos syndrome,"120160, 120150",IVS5-1G>C,Exon,+15,0,ttgccctcttttaaataacaacagaaaaatatttacaagtagaatgagaaaatgaactacatgactagtaactaaaaatattttatatatatatataattttttttttttacttctcta(g>c)AACTTTGCTGCTCAG/TATGATGGAAAAGGAGTTGGACTTGGCCCTGGACCAATGgtatgcttatctgt,False,,chr7:94401566,chr7:94401581/94401582,10.62,2.87,11.17,3.61,14.81,4.66,1556139,"Chiodo et al. (1992) A base substitution at the splice acceptor site of intron 5 of the COL1A2 gene activates a cryptic splice site within exon 6 and generates abnormal type I procollagen in a patient with Ehlers-Danlos syndrome type VII. J. Biol. Chem., 267, 6361-6369." 
+6,COL5A1,"collagen, type V, alpha 1",ENSG00000130635,Ehlers-Danlos syndrome,"120160, 120150",IVS4-2A>G,Exon,+12,0,cctagcttgagtgtcttttgtgagtggcagcttctagggagaatgtttggctctgaggacaagctcgtcttgtggcttggtctggactttcccctgcttcaaggcatggggctgtgtctcccaggtccccatgcgagtgctctgtgagctgctttttcatgagcgtctcttcttttcc(a>g)gGGTGACATCCAG/CAG/CTGCTCTTTGTCTCGGACCACCGGGCAGCTTATGATTACTGTGAGCACTACAGCCCTGACTGTGACACCGCAGTACCTGACACCCCACAGTCGCAGGACCCCAATCCAGATGAATATgtgagttaactctggc,False,,chr9:134727264,"chr9:134727277/134727278, chr9:134727280/134727281",9.58,1.45,9.51,0.45,12.17,1.95,12145749,"Takahara et al. (2002) Order of intron removal influences multiple splice outcomes, including a two-exon skip, in a COL5A1 acceptor-site mutation that results in abnormal pro-alpha1(V) N-propeptides and Ehlers-Danlos syndrome type I. Am. J. Hum. Genet., 71, 451-465." + diff --git a/src/python/test/xrefs/parsers/flatfiles/entrezgene.txt b/src/python/test/xrefs/parsers/flatfiles/entrezgene.txt new file mode 100644 index 000000000..bc07aa246 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/entrezgene.txt @@ -0,0 +1,13 @@ +#tax_id GeneID Symbol LocusTag Synonyms dbXrefs chromosome map_location description type_of_gene Symbol_from_nomenclature_authority Full_name_from_nomenclature_authority Nomenclature_status Other_designations Modification_date Feature_type +9606 1 A1BG - A1B|ABG|GAB|HYST2477 MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410 19 19q13.43 alpha-1-B glycoprotein protein-coding A1BG alpha-1-B glycoprotein O alpha-1B-glycoprotein|HEL-S-163pA|epididymis secretory sperm binding protein Li 163pA 20181208 - +9606 2 A2M - A2MD|CPAMD5|FWP007|S863-7 MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899 12 12p13.31 alpha-2-macroglobulin protein-coding A2M alpha-2-macroglobulin O alpha-2-macroglobulin|C3 and PZP-like alpha-2-macroglobulin domain-containing protein 5|alpha-2-M 20181208 - +9606 2 A2M - A2MD - 12 - alpha-2-macroglobulin protein-coding - - O - 20181208 - +9606 3 A2MP1 - A2MP HGNC:HGNC:8|Ensembl:ENSG00000256069 12 12p13.31 alpha-2-macroglobulin pseudogene 1 pseudo A2MP1 alpha-2-macroglobulin pseudogene 1 O pregnancy-zone protein pseudogene 20180329 - +9606 9 NAT1 - AAC1|MNAT|NAT-1|NATI MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171428 8 8p22 N-acetyltransferase 1 protein-coding NAT1 N-acetyltransferase 1 O arylamine N-acetyltransferase 1|N-acetyltransferase 1 (arylamine N-acetyltransferase)|N-acetyltransferase type 1|arylamide acetylase 1|monomorphic arylamine N-acetyltransferase 20181207 - +9606 10 NAT2 - AAC2|NAT-2|PNAT MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156006 8 8p22 N-acetyltransferase 2 protein-coding NAT2 N-acetyltransferase 2 O arylamine N-acetyltransferase 2|N-acetyltransferase 2 (arylamine N-acetyltransferase)|N-acetyltransferase type 2|arylamide acetylase 2 20181207 - +9606 11 NATP - AACP|NATP1 HGNC:HGNC:15 8 8p22 N-acetyltransferase pseudogene pseudo NATP N-acetyltransferase pseudogene O arylamide acetylase pseudogene 20180329 - +9606 12 SERPINA3 - AACT|ACT|GIG24|GIG25 MIM:107280|HGNC:HGNC:16|Ensembl:ENSG00000196136 14 14q32.13 serpin family A member 3 protein-coding SERPINA3 serpin family A member 3 O alpha-1-antichymotrypsin|cell growth-inhibiting gene 24/25 protein|growth-inhibiting protein 24|growth-inhibiting protein 25|serine (or cysteine) proteinase inhibitor, clade A, member 3|serpin A3|serpin peptidase inhibitor, clade A (alpha-1 antiproteinase, antitrypsin), member 3 20181208 - +9606 13 AADAC - CES5A1|DAC MIM:600338|HGNC:HGNC:17|Ensembl:ENSG00000114771 3 3q25.1 arylacetamide deacetylase 
protein-coding AADAC arylacetamide deacetylase O arylacetamide deacetylase|arylacetamide deacetylase (esterase) 20181207 - +9606 14 AAMP - - MIM:603488|HGNC:HGNC:18|Ensembl:ENSG00000127837 2 2q35 angio associated migratory cell protein protein-coding AAMP angio associated migratory cell protein O angio-associated migratory cell protein 20181208 - +9606 15 AANAT - DSPS|SNAT MIM:600950|HGNC:HGNC:19|Ensembl:ENSG00000129673 17 17q25.1 aralkylamine N-acetyltransferase protein-coding AANAT aralkylamine N-acetyltransferase O serotonin N-acetyltransferase|arylalkylamine N-acetyltransferase|serotonin acetylase 20181209 - +9313 16 rpsP FUT79_RS10890 FUT79_10890 - - - 30S ribosomal protein S16 protein-coding - - - 30S ribosomal protein S16 20240427 - diff --git a/src/python/test/xrefs/parsers/flatfiles/hgnc.txt b/src/python/test/xrefs/parsers/flatfiles/hgnc.txt new file mode 100644 index 000000000..824587793 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/hgnc.txt @@ -0,0 +1,21 @@ +HGNC ID Approved symbol Approved name Previous symbols Alias symbols NCBI Gene ID Ensembl gene ID RefSeq IDs CCDS IDs Locus specific databases +HGNC:5 A1BG alpha-1-B glycoprotein 1 ENSG00000121410 NM_130786 CCDS12976 +HGNC:37133 A1BG-AS1 A1BG antisense RNA 1 NCRNA00181, A1BGAS, A1BG-AS FLJ23569 503538 ENSG00000268895 NR_015380 +HGNC:24086 A1CF APOBEC1 complementation factor ACF, ASP, ACF64, ACF65, APOBEC1CF 29974 ENSG00000148584 NM_014576 CCDS7241, CCDS7242, CCDS7243, CCDS73133 +HGNC:7 A2M alpha-2-macroglobulin FWP007, S863-7, CPAMD5 2 ENSG00000175899 NM_000014 CCDS44827 +HGNC:27057 A2M-AS1 A2M antisense RNA 1 144571 ENSG00000245105 NR_026971 +HGNC:41022 A2ML1-AS1 A2ML1 antisense RNA 1 +HGNC:8 A2MP1 alpha-2-macroglobulin pseudogene 1 A2MP 3 ENSG00000256069 NG_001067 +HGNC:30005 A3GALT2 alpha 1,3-galactosyltransferase 2 A3GALT2P IGBS3S, IGB3S 127550 ENSG00000184389 NM_001080438 CCDS60080 +HGNC:18149 A4GALT alpha 1,4-galactosyltransferase (P blood group) P1 A14GALT, Gb3S, P(k) 53947 ENSG00000128274 NM_017436 CCDS14041 "Global Variome shared LOVD|https://databases.lovd.nl/shared/genes/ABCB7","LRG_795|https://ftp.ebi.ac.uk/pub/databases/lrgex/pending/LRG_795.xml" +HGNC:17968 A4GNT alpha-1,4-N-acetylglucosaminyltransferase alpha4GnT 51146 ENSG00000118017 NM_016161 CCDS3097 "Global Variome shared LOVD|https://databases.lovd.nl/shared/genes/ACE2" +HGNC:13666 AAAS aladin WD repeat nucleoporin 8086 ENSG00000094914 CCDS8856, CCDS53797 +HGNC:30205 AAMDC adipogenesis associated Mth938 domain containing C11orf67 PTD015, FLJ21035, CK067 28971 ENSG00000087884 NM_024684 CCDS8254, CCDS81604, CCDS81605, CCDS86232 +HGNC:18 AAMP angio associated migratory cell protein 14 ENSG00000127837 NM_001087 CCDS33378, CCDS77530 +HGNC:19 AANAT aralkylamine N-acetyltransferase SNAT 15 ENSG00000129673 NM_001088 CCDS11745, CCDS54169 +HGNC:15886 AAR2 AAR2 splicing factor homolog C20orf4 bA234K24.2 25980 ENSG00000131043 NM_015511 CCDS13273 +HGNC:33842 AARD alanine and arginine rich domain containing protein C8orf85 LOC441376 441376 ENSG00000205002 NM_001025357 CCDS34935 +HGNC:20 AARS alanyl-tRNA synthetase CMT2N, AlaRS 16 ENSG00000090861 NM_001605 CCDS32474 "LRG_359|https://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_359.xml" +HGNC:21022 AARS2 alanyl-tRNA synthetase 2, mitochondrial AARSL KIAA1270, bA444E17.1 57505 ENSG00000124608 NM_020745 CCDS34464 +HGNC:28417 AARSD1 alanyl-tRNA synthetase domain containing 1 MGC2744 80755 ENSG00000266967 NM_001261434 CCDS11447, CCDS45691, CCDS58552 +HGNC:49894 AARSP1 alanyl-tRNA synthetase pseudogene 1 
ENSG00000249038 diff --git a/src/python/test/xrefs/parsers/flatfiles/hpa.txt b/src/python/test/xrefs/parsers/flatfiles/hpa.txt new file mode 100644 index 000000000..5deb80b99 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/hpa.txt @@ -0,0 +1,11 @@ +Antibody,antibody_id,ensembl_peptide_id,link +CAB000001,1,ENSP00000363822,http://www.proteinatlas.org/ENSG00000169083-AR +CAB000001,1,ENSP00000379358,http://www.proteinatlas.org/ENSG00000169083-AR +CAB000001,1,ENSP00000379359,http://www.proteinatlas.org/ENSG00000169083-AR +CAB000001,1,ENSP00000421155,http://www.proteinatlas.org/ENSG00000169083-AR +CAB000001,1,ENSP00000479013,http://www.proteinatlas.org/ENSG00000169083-AR +CAB000001,1,ENSP00000482407,http://www.proteinatlas.org/ENSG00000169083-AR +CAB000001,1,ENSP00000484033,http://www.proteinatlas.org/ENSG00000169083-AR +CAB000002,2,ENSP00000224784,http://www.proteinatlas.org/ENSG00000107796-ACTA2 +CAB000002,2,ENSP00000396730,http://www.proteinatlas.org/ENSG00000107796-ACTA2 +CAB000002,2,ENSP00000398239,http://www.proteinatlas.org/ENSG00000107796-ACTA2 diff --git a/src/python/test/xrefs/parsers/flatfiles/jgi_protein.fasta b/src/python/test/xrefs/parsers/flatfiles/jgi_protein.fasta new file mode 100644 index 000000000..25208108b --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/jgi_protein.fasta @@ -0,0 +1,108 @@ +>ci0100130000 +MPLEENISSSKRKPGSRGGVSFFSYFTQELTHGYFMDQNDARYTERRERVYTFLKQPREIEKVRPFPPFL +CLDVFLYVFTFLPLRVLFALLKLLSAPFCWFQRRSLLDPAQSCDLLKGVIFTSCVFCMSYIDTSIIYHLV +RAQTLIKLYIIYNMLEVADRLFSSFGQDILDALFLTATESNRQKRESFRVLLHLILAVIYVFSHAVLVLF +EATTLNVAFNSHNKVLLTIMMANNFVEIKGTVFKKYDKNNLFQISCSDIRERFHYFALMLVVLLRNMQQY +SWNYEHFTEIIPNMLMLLSSECVVDWFKHAFVLKFNHIPIESYSEYRATLAYDVASSRHKDSINDHSDVV +SRRLGFIPLPLAVLVSYSSALLLPVSDFSVCSSVLVYRIKKRFV*MHFSSLTLLKVFNSIVIVGKACCYI +SDDEAQAANVRVNGARIAVVDPFEQRGNKTILVSQARAQPPEPTVKPPASGDPGLDSKKLLLSPEKNRKL +PKEVTTPARLRSMRAPSVDHTVAAGTNLPSRNDDDVGDVDVLRHQAPDSVRSRKRHTATIVKATAIDEEI +H* +>ci0100130001 +MLPIVDFKQCRPSVEASDKEINETAKLLVDALSTVGFAYLKNCGIKKNCRRSQKHRG*MGGVRYLYYPPI +KGELELNQERLGEHSDYGSITLLFVDDNGGLQIETEGTYKDVPVIEDTILINIGDALEFWTKGKLRSTKH +RVNIPDDEVKRNSIRRSIGYFVFPDDDVVINQPLQFKGDADVPDPVKDPITALKYIQQKLSHTCQNT* +>ci1100130002 +MNWKTWEEMENDLGIYYRPTNRKLDRRKGPIEEGQINFKITIPSTLKRKIKHDVDKNLNEELIENADKQQ +NTEEQSHSMDQIFSSTQIGASVSHNVEDLHSVKRPRLSPIIAKSKPAVHSTSVIINPSDEESDSVFDKTK +SRADVSHKSIPIHADENLAQSSVHLDVENSVLSDKSFDNSKNASNRFDLPATASKPTKSTQQNESEMFLI +SESATLNESYHQVLSKAKHFLGKFKPKKIPLKVNNNQTKTSNTDKPRKIKPPKGFDGFAVVPPINPASSS +AKHRTTSTEVNRISSNLAQWRYTLEQRLLSQSSDS*MVASAIYVLDLKGKVLISRNYRGNIPMNAIDAFP +KLLLEQEEEGTLTPVLMHGDITFVFIRFSNLYMVATTNKNSNVMMISSFMHKLCQIFAHYFKELEEESIK +DNFVIVYELFDEVMDFGYPQFSDPKILQEYITQEGHKLEIQVRPPSTVTNAVSWRSEGLKYRKNEVFLDV +IESVNLLVSSTGNVLRSEIVGSVKMRVYLTGMPELRLGLNDKVLFQNTGRGKSKAVEMEDVKFHQCVRLS +RFENDRTISFIPPDGEFELMSYRLNTHVKPLIWIESVIERHSHSRVEIMVKAKSQFKRRSTANNVEIQIP +VPNDADTPKFKTSVGSVKWVPETSNIVWTVKSFPGGKEYLMRAHFGLPSVESEELEGKPPISVKFEIPYF +TTSGIQVRYLKIIEKSGYQALPWVRYITQNGDYQLRTN* +>ci0100130003 +MPPKKKKEVEKPPLILGRLGTSLKIGIVGLPNVGKSTFFNVLTKSEASAENFPFCTIDPNESRVPVPDER +WEFLCKYHKPASKVPAFLSVVDIAGLVKGANEGQGLGNAFLSHISGCDAIFHMTRAFDDAEVVHVEGDVN +PVRDLEIIQEELRLKDVEHLTKRLAELEKVYSRGGEKKYKLEFETLSKIKTLLVDEKKPVRDGEWGGKEI +EVLNEHLFLTSKPQIYLVNLSEKDYIRKKNKWLMKIKTWVTENDSSAILIPFSGAFELKLAEMADDAERK +AYLEEQYKDSVGSALSKIVVTGFKCLGLQYFFTAGADEVKAWTIKTGFLAPQAAGRIHTDFEKGFIMAEV +MKFSDFKELGSESAVKSAGKYRQQGRNYIVEDGDIIFFKFNTPSQPKKK*MSQLAEMADDAERKAYLEEQ +YKDSVGSALSKIVVTGFKCLGLQYFFTAGADEVKAWTIKTGFLAPQAAGRIHTDFEKGFIMAEVMKFSDF +KELGSESAVKSAGKYRQQGRNYIVEDGDIIFFKFNTPSQPKKK* +>ci0100130004 
+VFKVCLNHFITEAIHFNFKENSDKVLVWAATDFSDPDKPNGEMLQFAMKLKSAETAINFLNTVQDGNEAF +SVKRLLDPVSVLEDKSEINTSVQDASNASQQSENGNTSVHKSPVKSTKPAFSFANVATPFGNKNKPLFSD +IVFGMLMVLFNFFK*MTSQPGTPKTQPAAPTTKFTFDASSISFNFGSTSTPATSAPPFQIAPAVMQKPAA +SKSLFGVVQPSTGNNDASQQKEQNTIIGQHSSKFQFDMATADNDQNGVDTNVGEKETKKVEKGKNSIFLC +SCYMYSYIICSAPWLSGLGACIGTKVFPVQCSPAILTLITYVSLGKTLNGHCSNPAVTNGLSKS*MLFTG +ASQSPDADPEAFNPDYKPVVAELPPLIEMKTGEEEEEILFKERCKMFRFDNSISNWKERGLGELKILFHK +GMNLHRVVMRREQVFKVCANHLITKDMNLLPNSDKSWMYVANNKSDGEAEVEKLSVKFKTPQIANQFKEI +WDTCRHGS* +>ci0100130005 +CQICFETYTRPKSLNCQHTFCLKCLEEYTPPNSVRVICPTCRSEQPLTADGINGLKDNFFISSMSDMLKT +VKEIRSEDKDGTSLMCDTCDHDNRKVAIARCLDCTDFLCNECSTWHIRTKLTRRHKIVSLSEFESGIHNQ +ELKSRAKIYCMIHDGEAAKIYCQSCQCPICHECVESGHSRHQLGKQGIADGEFESTPNLAINSVNEVITA +DYDGAKIQIFDPQGNFKDSFVTEVRGVNKRMCKPAGIAILDNDDIVVCCEDQVHIWTHEGKSVLGFGKGQ +FGNCSSIAVNSENRIVVADVGKHCISVFTDTGKMLLQFGAQGKGESKLVEPRYVACDSQNNIIVSDGGDC +SVKKFSSQGEFLLSFGAEGPERGQFQGPRGLCTDEHDNILVADCWNHRVDIFTPDGCFMRHIATGADSLH +FPWCISLTTNGKLVLSEDYSWSVKIF +>ci0100130006 +MEKDTSLVKVVTEGNGNILKPFTNSNNVELHPMEDEVVVAALEQMSPAEQWELKQQKKREMKMQNELREK +IEAGCKIRQILASPDWKGSIDKLLIAKVKSRDWREVESIFALLKKEEYSKFSCQQACDEQMRGPLRIAVE +NKDIKMLELLLSEDIIKNEKIKVK*MISKARSSPAYILALGDVGKVNKLIARSKFIYEYLLIQADRNFDT +FLYCIKHIYELRQLAKVEHEFSNFYLQLVEDVEKFMCKLLDQYIFQSSPECDINNGLEEIGIGTRVRMLE +KACDYKLVNFVTHHNPQLAIEHLTYRNTPFFRTGNHITFYLTRIMLALMFPVLSIFNIINPKSRAGRLIT +YPCTSYDCRMMSEFLFVVFLVTNISNKKMHLEYLAAPPTTWEVLILIWVMGKFVQEINELNKRGLESYFF +DPWNHLDLWATILFAFNYAFRIVDYVKYHQVPVQQRPPRSEWYMFEWRLVAEGLMACAYVFVFIRLLGLT +RVDRTLGPLQISLARMVKDVVQFLCIFAFILFAFALALTELYWFYGTPKGKEISCDVGVRSNLTNTTASC +PEINTMFHSVWYSMIDLFWSLFGQLDMSKLSLSGKHLFTEYVAKALLAIYHVIAIIVLLNMLIAMMSRSY +ERTSENEEKEWKFQRTKMWIRILRREIIRPPPMNLLPSFKTIWYYLKRLKRLCCFFLVHLIRCRCSTIKR +SFFPGQHRVKYQALNYHKARRNLISKYKTNILLSSENDCT* +>ci0100130007 +MDRIENLLDTGGRYLSPADRQFVSTLLSELEQFQYQAPKDRALMFHPLGGFQRYLIHKVTEVFPKLTSFS +IGDDSNRRTVVCFKSKKKDQQQGLQANGTSKTLPNPKNEYVNAPVRGREEEGRQSAPRSRDSSRSRKKNE +PYDANTSKQPRQPKQLDQAYLPKPLRTKKATNKRDQMKRSRSLQSSPVRMDEEVYHTDDDGRRGRRSKGS +ESGKRAPRSASLKPSSRRRERDVSPEPSEDEFRRQKQHPPLRHNVSDVSIHKRSQGLRMEGKVIKLLSLC +VSQQLWFTNNSVVTYLFTLG*MDEDSSNDYYEDEDSASMASDERPPSYERVKHSSSIKKAERNLNRIKRV +DSQRSKPTSDEDVQIRDSSSSSSGRTVPRGSTHKSPPASTSSHHSKEGKMQRSSTMPRSTKEPAKKQRSS +SKGRSQTLKGPPVSTGRSKSQSTKKHPAPPPPQKEEEKPKRSSSSKQRKKPTSLDEPKKHSSVVTSTDDL +LDASNSTLYHPLPDQEQPSTSKKKSTKEXTPPSDPNAISPVTCDGTAQTEVSGTLSRRKDNRLRESTDTT +PQDSPSHQPSNPDYSKDDVDGFLSGSHASSCSSLNSAPPVPKETKNTEAPPKETTQPPIPTENIPKPQDI +EPATNSVDVEVHENDEATGAEENSESESDAASTKSNVGSIDRSAGDISIYSLDASDGSDKEDEDDKQESD +KERSGESSDSEEATWPAPPVPVNGDGVDKFKSPVSGDEQPSPPQEPATVTQTIDIVTSPTDNADNEEIEQ +HQSAGSVTSSSSEPFAAPQHPEVPLIDVSDSDASSTATEGEKAAENGDLGNGEGERSNSSSDLSTSDENH +KMSDGSYDGDQSRTEESILADKEQPATNGTNQVVEVDGEMVVVQEPQFDYYKWKPDQDVWKSPEYKKFVE +IFNFPASMSDIEVTQHLSKYRGLRLARVDATHALCTLPSDVIAEELADQHFTAFETRPLCDASKQTKAKA +KAKLENEKFEEVRAQRPKSSNAVAKRMIAGALGSSHRSSANKQSTKK* +>ci0100130008 +MKEFLVWSKMNRTTKLKLNGACTVPLSMKGINLKRVKIKRLKCGVWGCTISSNISFVSKHTEIKVLKTTA +KYCDRVNVTNDMGNFTWHEAVTGVTVRYNCTEPQQEVS*MTENLDPGLDLYSLSVLPQKLHVHNCIKYRT +GYTCDTTQMMLERNISRENAWSVCSNAVGNLSGVIVRDSIGFQLLGEVIGNCSEHLQENMKSSNIALAGY +NFNVTDQKYIYCSISEAIEWTELTKYPELELVCQDLLLFNFSILLFFHSFLFFILHTYITITVTLGVFVA +TTFKTIHLLHPGFKSKTAFSLLLFVHTN*MRHLPILHRNGNLFQSSTTTTMSLMEYKAWKNHLQSSQQED +KGKIVAKQVDFLADMTKFKQELKKNIKPDPERHVLVSSIYSVHVNNRNVVNLTKPAVYIFHTQESVSQLN +LDYEHYIAMWNGSGWNRAHHHCIFNHTTRDQQQHNITVIQCDVLATFAIMKV*TLTHPTVYAGSAVLTLT +LLLMLITYAVFRNLLLSRDARHMIINTTLHLLVAVLTFTVGVWSISSKVMCYVTGILLHYSSLSVLLWIT +LSSGNICKEMLAAQQPPLLEPKPSKPMLRFYLIGCGIPIIICGITASAKIENYNGDGGQYCWLSWETSLY 
+AFYAPAACIAIFCILVLLRILATLNCAPSGEMKRKSRKRRKREYSKEFIGEDTPLQYMESSFTQHSNPVN +NSFSKDVENEQSSKTRLQGVALILVLFITTWVTAAMTVAAPRIQEQKISSHVRQTLFNQNYFKFFKQEEP +TPSIDLHLIFSCIFAMMCIAMSSFLLIQHLTSRSDVRRSWRNLCNRRKKKVLEAESNIQINNDIATNIPD +RKRENTVTTAATDPTTTETALHLTTGGESSRGNLTSRHSPFGRNSSAPLPGGTYDTSRASSAHKCAQFHR +EKALSNTLTESGLLVPHSNSSLLLPSDPNNFYTLTEHQHGDSFHSPSQEWGYHGYGQHYYKSSSKPVKMT +NLQQHQLDSSMTEHSFDDSHNNMHTIPVVLQHTQPKIDNKVLYHRYQKMRKALDAKRNRQKKLTVLREYA +QDPLTSNDESPTKPQKSIDKSSEQIPLLSNVSKCHNANEDEMGLDNLVNTDNIITLPMPKPADEGEQNYR +LLLISSPNKKKGETCMQGKYGQRKRVTGDPNLVSSYRRKSLQQNTVPLEPGGSSRSAKRRRPPASRRRAR +QRATRHETQIKAAASHTEATASQVEATAPPVTDNVSVKSRMSANQAAAEARSSGWAMHQDKAHNYLTAQD +LFSGAMHLPSRKSSSCQAPSNTEPQNPGFQIIDMDDNNVGHENATTSPTEDTDLAYAALRNETSV* +>ci0100130009 +MNNIVPSMVKERIRLIKPSRPQAPTEYNLQYKKWPLRPFTLDTFRVVKKQPKKELIEHKDPITQPTHGLC +KMGSFNSDYPCSTASSDGLPTAAVEKRQNKTNDSIVALPPSIDVGEVIDESLSSTLRTQTAKMHERQPKQ +KSPRKPGKSSGKYASKNREGRSGSSFQAVESXMNEDGRQWASDQPPNTEYLQNLPSDNSLDKINPVRSES +IILKPFDQQTNDLPRTPQKSCKHLVSKEGRRSKPFSVDSVGEDSLRQLYSNIPPNDLETLRKTYKGLLGG +FIKVYIWLVLLINTATHLNCLIAVKQNTCVKTTASGV*MKMNMTSSSTFKAHKPKPPEAPIWANKTIVSL +GSGHWNLPMNKGEEWKVQIPEDCKVPVGQKPEYKIRVHNDAKSSDVPMTDLESNEKKTDRTNSDPFIALY +YPDRRTTSSYVSEYMRKFHRFPVQKKRVLV* diff --git a/src/python/test/xrefs/parsers/flatfiles/mgi.txt b/src/python/test/xrefs/parsers/flatfiles/mgi.txt new file mode 100644 index 000000000..9cae91f0e --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/mgi.txt @@ -0,0 +1,10 @@ +MGI:1915733 1110002O04Rik RIKEN cDNA 1110002O04 gene -1.0 1 ENSMUSG00000102531 ENSMUST00000194261 lincRNA gene 35879845 35881119 + lincRNA|ncRNA +MGI:1926146 1500015O10Rik RIKEN cDNA 1500015O10 gene 23.44 1 ENSMUSG00000026051 ENSMUST00000027217 ENSMUSP00000027217 protein coding gene 43730602 43742564 + protein-coding|protein_coding +MGI:1919275 1600012P17Rik RIKEN cDNA 1600012P17 gene 68.49 1 ENSMUSG00000047661 ENSMUST00000062159 ENSMUST00000162474 lincRNA gene 158967701 158980463 - lincRNA +MGI:1914753 1700001G17Rik RIKEN cDNA 1700001G17 gene 12.78 1 ENSMUSG00000103746 lncRNA gene 33669824 33670712 + TEC|ncRNA +MGI:1916606 1700003I22Rik RIKEN cDNA 1700003I22 gene -1.0 1 ENSMUSG00000100372 ENSMUST00000190280 ENSMUST00000186048 lincRNA gene 56018978 56020203 + ncRNA|lincRNA +MGI:1925628 1700006P03Rik RIKEN cDNA 1700006P03 gene -1.0 1 ENSMUSG00000102738 unclassified gene 137325434 137325842 - TEC +MGI:1916558 1700007P06Rik RIKEN cDNA 1700007P06 gene 90.37 1 ENSMUSG00000089730 ENSMUST00000160380 antisense lncRNA gene 187125138 187127852 + ncRNA|antisense +MGI:1923817 1700012E03Rik RIKEN cDNA 1700012E03 gene -1.0 1 ENSMUSG00000101275 ENSMUST00000186237 lincRNA gene 120435805 120438455 + lincRNA +MGI:1916678 1700016C15Rik RIKEN cDNA 1700016C15 gene 82.8 1 ENSMUSG00000015962 ENSMUST00000016106 ENSMUSP00000016106 protein coding gene 177729814 177753324 + protein_coding|protein-coding +MGI:1919458 1700016L21Rik RIKEN cDNA 1700016L21 gene -999.0 1 ENSMUSG00000101483 ENSMUST00000187497 ENSMUST00000189139 antisense lncRNA gene 80445932 80475660 + antisense|ncRNA diff --git a/src/python/test/xrefs/parsers/flatfiles/mgi_desc.txt b/src/python/test/xrefs/parsers/flatfiles/mgi_desc.txt new file mode 100644 index 000000000..f3ffe6388 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/mgi_desc.txt @@ -0,0 +1,11 @@ +MGI Accession ID Chr cM Position genome coordinate start genome coordinate end strand Marker Symbol Status Marker Name Marker Type Feature Type Marker Synonyms (pipe-separated) +MGI:1341858 5 syntenic 03B03F O DNA segment, 03B03F (Research Genetics) BAC/YAC end BAC/YAC end 
+MGI:1341869 5 syntenic 03B03R O DNA segment, 03B03R (Research Genetics) BAC/YAC end BAC/YAC end +MGI:1337005 11 syntenic 03.MMHAP34FRA.seq O DNA segment, 03.MMHAP34FRA.seq DNA Segment DNA segment +MGI:1918911 7 29.36 45567795 45575176 - 0610005C13Rik O RIKEN cDNA 0610005C13 gene Gene antisense lncRNA gene +MGI:1923503 7 syntenic 74818818 74853813 - 0610006L08Rik O RIKEN cDNA 0610006L08 gene Gene lincRNA gene +MGI:1925547 UN N/A 0610008J02Rik O RIKEN cDNA 0610008J02 gene Gene unclassified gene +MGI:1913300 11 31.26 51685386 51688874 - 0610009B22Rik O RIKEN cDNA 0610009B22 gene Gene protein coding gene +MGI:3698435 2 18.90 26445605 26457995 + 0610009E02Rik O RIKEN cDNA 0610009E02 gene Gene unclassified non-coding RNA gene +MGI:1918921 16 syntenic 91947326 91947785 0610009F21Rik O RIKEN cDNA 0610009F21 gene Gene unclassified gene +MGI:1926146 1 23.44 43730602 43742564 + 1500015O10Rik O RIKEN cDNA 1500015O10 gene Gene protein coding gene Ecrg4|augurin diff --git a/src/python/test/xrefs/parsers/flatfiles/mim.txt b/src/python/test/xrefs/parsers/flatfiles/mim.txt new file mode 100644 index 000000000..e44b71186 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/mim.txt @@ -0,0 +1,122 @@ +*RECORD* +*FIELD* NO +100050 +*FIELD* TI +100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT +*FIELD* TX + +DESCRIPTION + +Aarskog syndrome is characterized by short stature and facial, limb, +and genital anomalies. One form of the disorder is X-linked (see +305400), but there is also evidence for autosomal dominant and +autosomal recessive (227330) inheritance (summary by Grier et al., +1983). + +*RECORD* +*FIELD* NO +100070 +*FIELD* TI +%100070 AORTIC ANEURYSM, FAMILIAL ABDOMINAL, 1; AAA1 +;;ANEURYSM, ABDOMINAL AORTIC; AAA;; +ABDOMINAL AORTIC ANEURYSM +*FIELD* TX + +DESCRIPTION + +Abdominal aortic aneurysm is a multifactorial disorder with multiple +genetic and environmental risk factors. The disorder may occur as part +of a heritable syndrome or in isolation (summary by Kuivaniemi et al., +2003). + +*RECORD* +*FIELD* NO +100100 +*FIELD* TI +#100100 PRUNE BELLY SYNDROME; PBS +;;ABDOMINAL MUSCLES, ABSENCE OF, WITH URINARY TRACT ABNORMALITY AND +CRYPTORCHIDISM;; +EAGLE-BARRETT SYNDROME; EGBRS +*FIELD* TX + +A number sign (#) is used with this entry because of evidence that +prune belly syndrome (PBS) is caused by homozygous mutation in the +CHRM3 gene (118494) on chromosome 1q43. One such family has been +reported. + +*RECORD* +*FIELD* NO +100500 +*FIELD* TI +^100500 MOVED TO 100650 +*FIELD* TX + +fnord + +*RECORD* +*FIELD* NO +100640 +*FIELD* TI +*100640 ALDEHYDE DEHYDROGENASE 1 FAMILY, MEMBER A1; ALDH1A1 +;;ALDEHYDE DEHYDROGENASE 1; ALDH1;; +ACETALDEHYDE DEHYDROGENASE 1;; +ALDH, LIVER CYTOSOLIC;; +RETINAL DEHYDROGENASE 1; RALDH1 +*FIELD* TX + +DESCRIPTION + +The ALDH1A1 gene encodes a liver cytosolic isoform of acetaldehyde +dehydrogenase (EC 1.2.1.3), an enzyme involved in the major pathway of +alcohol metabolism after alcohol dehydrogenase (ADH, see 103700). See +also liver mitochondrial ALDH2 (100650), variation in which has been +implicated in different responses to alcohol ingestion. + +*RECORD* +*FIELD* NO +100650 +*FIELD* TI +^100650 MOVED TO 200150 +*FIELD* TX + +This entry was incorporated into 200150 on March 2, 2004. + +*RECORD* +*FIELD* NO +100680 +*FIELD* TI +^100680 MOVED TO 100740 +*FIELD* TX + +This entry was incorporated into entry 100740 on August 4, 2010. 
+ +*RECORD* +*FIELD* NO +100740 +*FIELD* TI +^100740 REMOVED FROM DATABASE +*FIELD* TX + +fnord + +*RECORD* +*FIELD* NO +200150 +*FIELD* TI ++200150 CHOREOACANTHOCYTOSIS; CHAC +;;LEVINE-CRITCHLEY SYNDROME;; +ACANTHOCYTOSIS WITH NEUROLOGIC DISORDER;; +NEUROACANTHOCYTOSIS;; +CHOREA-ACANTHOCYTOSIS +*FIELD* TX + +A number sign (#) is used with this entry because choreoacanthocytosis +can be caused by homozygous or compound heterozygous mutation in the +VPS13A gene (605978), which encodes chorein, on chromosome 9q21. + +DESCRIPTION + +Choreoacanthocytosis (CHAC) is a rare disorder characterized by +progressive neurodegeneration and red cell acanthocytosis, with onset +in the third to fifth decade of life (Rubio et al., 1997). +*THEEND* diff --git a/src/python/test/xrefs/parsers/flatfiles/mim2gene.txt b/src/python/test/xrefs/parsers/flatfiles/mim2gene.txt new file mode 100644 index 000000000..dfbe2a148 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/mim2gene.txt @@ -0,0 +1,10 @@ +#MIM number GeneID type Source MedGenCUI Comment +100050 - phenotype - C3149220 - +100070 - phenotype - C1853365 - +100100 1131 phenotype GeneMap C0033770 question +100200 - phenotype - C4551519 - +100300 57514 phenotype GeneMap C4551482 - +100600 - phenotype - C0000889 - +100640 216 gene - - - +100650 217 gene - - - +100660 218 gene - - - diff --git a/src/python/test/xrefs/parsers/flatfiles/mirbase.txt b/src/python/test/xrefs/parsers/flatfiles/mirbase.txt new file mode 100644 index 000000000..43a5cef15 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/mirbase.txt @@ -0,0 +1,506 @@ +ID cel-let-7 standard; RNA; CEL; 99 BP. +XX +AC MI0000001; +XX +DE Caenorhabditis elegans let-7 stem-loop +XX +RN [1] +RX PUBMED; 11679671. +RA Lau NC, Lim LP, Weinstein EG, Bartel DP; +RT "An abundant class of tiny RNAs with probable regulatory roles in +RT Caenorhabditis elegans"; +RL Science. 294:858-862(2001). +XX +RN [2] +RX PUBMED; 12672692. +RA Lim LP, Lau NC, Weinstein EG, Abdelhakim A, Yekta S, Rhoades MW, Burge CB, +RA Bartel DP; +RT "The microRNAs of Caenorhabditis elegans"; +RL Genes Dev. 17:991-1008(2003). +XX +RN [3] +RX PUBMED; 12747828. +RA Ambros V, Lee RC, Lavanway A, Williams PT, Jewell D; +RT "MicroRNAs and other tiny endogenous RNAs in C. elegans"; +RL Curr Biol. 13:807-818(2003). +XX +RN [4] +RX PUBMED; 12769849. +RA Grad Y, Aach J, Hayes GD, Reinhart BJ, Church GM, Ruvkun G, Kim J; +RT "Computational and experimental identification of C. elegans microRNAs"; +RL Mol Cell. 11:1253-1263(2003). +XX +RN [5] +RX PUBMED; 17174894. +RA Ruby JG, Jan C, Player C, Axtell MJ, Lee W, Nusbaum C, Ge H, Bartel DP; +RT "Large-scale sequencing reveals 21U-RNAs and additional microRNAs and +RT endogenous siRNAs in C. elegans"; +RL Cell. 127:1193-1207(2006). +XX +RN [6] +RX PUBMED; 19460142. +RA Kato M, de Lencastre A, Pincus Z, Slack FJ; +RT "Dynamic expression of small non-coding RNAs, including novel microRNAs +RT and piRNAs/21U-RNAs, during Caenorhabditis elegans development"; +RL Genome Biol. 10:R54(2009). +XX +RN [7] +RX PUBMED; 20062054. +RA Zisoulis DG, Lovci MT, Wilbert ML, Hutt KR, Liang TY, Pasquinelli AE, Yeo +RA GW; +RT "Comprehensive discovery of endogenous Argonaute binding sites in +RT Caenorhabditis elegans"; +RL Nat Struct Mol Biol. 17:173-179(2010). +XX +DR RFAM; RF00027; let-7. +DR WORMBASE; C05G5/12462-12364; . 
+XX +CC let-7 is found on chromosome X in Caenorhabditis elegans [1] and pairs to +CC sites within the 3' untranslated region (UTR) of target mRNAs, specifying +CC the translational repression of these mRNAs and triggering the transition +CC to late-larval and adult stages [2]. +XX +FH Key Location/Qualifiers +FH +FT miRNA 17..38 +FT /accession="MIMAT0000001" +FT /product="cel-let-7-5p" +FT /evidence=experimental +FT /experiment="cloned [1-3], Northern [1], PCR [4], 454 [5], +FT Illumina [6], CLIPseq [7]" +FT miRNA 60..81 +FT /accession="MIMAT0015091" +FT /product="cel-let-7-3p" +FT /evidence=experimental +FT /experiment="CLIPseq [7]" +XX +SQ Sequence 99 BP; 26 A; 19 C; 24 G; 0 T; 30 other; + uacacugugg auccggugag guaguagguu guauaguuug gaauauuacc accggugaac 60 + uaugcaauuu ucuaccuuac cggagacaga acucuucga 99 +// +ID cel-lin-4 standard; RNA; CEL; 94 BP. +XX +AC MI0000002; +XX +DE Caenorhabditis elegans lin-4 stem-loop +XX +RN [1] +RX PUBMED; 11679671. +RA Lau NC, Lim LP, Weinstein EG, Bartel DP; +RT "An abundant class of tiny RNAs with probable regulatory roles in +RT Caenorhabditis elegans"; +RL Science. 294:858-862(2001). +XX +RN [2] +RX PUBMED; 10642801. +RA Olsen PH, Ambros V; +RT "The lin-4 regulatory RNA controls developmental timing in Caenorhabditis +RT elegans by blocking LIN-14 protein synthesis after the initiation of +RT translation"; +RL Dev Biol. 216:671-680(1999). +XX +RN [3] +RX PUBMED; 12672692. +RA Lim LP, Lau NC, Weinstein EG, Abdelhakim A, Yekta S, Rhoades MW, Burge CB, +RA Bartel DP; +RT "The microRNAs of Caenorhabditis elegans"; +RL Genes Dev. 17:991-1008(2003). +XX +RN [4] +RX PUBMED; 12747828. +RA Ambros V, Lee RC, Lavanway A, Williams PT, Jewell D; +RT "MicroRNAs and other tiny endogenous RNAs in C. elegans"; +RL Curr Biol. 13:807-818(2003). +XX +RN [5] +RX PUBMED; 17174894. +RA Ruby JG, Jan C, Player C, Axtell MJ, Lee W, Nusbaum C, Ge H, Bartel DP; +RT "Large-scale sequencing reveals 21U-RNAs and additional microRNAs and +RT endogenous siRNAs in C. elegans"; +RL Cell. 127:1193-1207(2006). +XX +RN [6] +RX PUBMED; 19460142. +RA Kato M, de Lencastre A, Pincus Z, Slack FJ; +RT "Dynamic expression of small non-coding RNAs, including novel microRNAs +RT and piRNAs/21U-RNAs, during Caenorhabditis elegans development"; +RL Genome Biol. 10:R54(2009). +XX +RN [7] +RX PUBMED; 20062054. +RA Zisoulis DG, Lovci MT, Wilbert ML, Hutt KR, Liang TY, Pasquinelli AE, Yeo +RA GW; +RT "Comprehensive discovery of endogenous Argonaute binding sites in +RT Caenorhabditis elegans"; +RL Nat Struct Mol Biol. 17:173-179(2010). +XX +DR RFAM; RF00052; lin-4. +DR WORMBASE; F59G1/6156-6249; . +XX +CC lin-4 is found on chromosome II in Caenorhabditis elegans [1] and is +CC complementary to sequences in the 3' untranslated region (UTR) of lin-14 +CC mRNA. lin-4 acts to developmentally repress the accumulation of lin-14 +CC protein. This repression is essential for the proper timing of numerous +CC events of Caenorhabditis elegans larval development [2]. 
+XX +FH Key Location/Qualifiers +FH +FT miRNA 16..36 +FT /accession="MIMAT0000002" +FT /product="cel-lin-4-5p" +FT /evidence=experimental +FT /experiment="cloned [1,3-4], 454 [5], Illumina [6], +FT CLIPseq [7]" +FT miRNA 55..76 +FT /accession="MIMAT0015092" +FT /product="cel-lin-4-3p" +FT /evidence=experimental +FT /experiment="CLIPseq [7]" +XX +SQ Sequence 94 BP; 17 A; 25 C; 26 G; 0 T; 26 other; + augcuuccgg ccuguucccu gagaccucaa gugugagugu acuauugaug cuucacaccu 60 + gggcucuccg gguaccagga cgguuugagc agau 94 +// +ID cel-mir-1 standard; RNA; CEL; 96 BP. +XX +AC MI0000003; +XX +DE Caenorhabditis elegans miR-1 stem-loop +XX +RN [1] +RX PUBMED; 11679671. +RA Lau NC, Lim LP, Weinstein EG, Bartel DP; +RT "An abundant class of tiny RNAs with probable regulatory roles in +RT Caenorhabditis elegans"; +RL Science. 294:858-862(2001). +XX +RN [2] +RX PUBMED; 11679672. +RA Lee RC, Ambros V; +RT "An extensive class of small RNAs in Caenorhabditis elegans"; +RL Science. 294:862-864(2001). +XX +RN [3] +RX PUBMED; 11679670. +RA Lagos-Quintana M, Rauhut R, Lendeckel W, Tuschl T; +RT "Identification of novel genes coding for small expressed RNAs"; +RL Science. 294:853-858(2001). +XX +RN [4] +RX PUBMED; 12672692. +RA Lim LP, Lau NC, Weinstein EG, Abdelhakim A, Yekta S, Rhoades MW, Burge CB, +RA Bartel DP; +RT "The microRNAs of Caenorhabditis elegans"; +RL Genes Dev. 17:991-1008(2003). +XX +RN [5] +RX PUBMED; 12747828. +RA Ambros V, Lee RC, Lavanway A, Williams PT, Jewell D; +RT "MicroRNAs and other tiny endogenous RNAs in C. elegans"; +RL Curr Biol. 13:807-818(2003). +XX +RN [6] +RX PUBMED; 12769849. +RA Grad Y, Aach J, Hayes GD, Reinhart BJ, Church GM, Ruvkun G, Kim J; +RT "Computational and experimental identification of C. elegans microRNAs"; +RL Mol Cell. 11:1253-1263(2003). +XX +RN [7] +RX PUBMED; 17174894. +RA Ruby JG, Jan C, Player C, Axtell MJ, Lee W, Nusbaum C, Ge H, Bartel DP; +RT "Large-scale sequencing reveals 21U-RNAs and additional microRNAs and +RT endogenous siRNAs in C. elegans"; +RL Cell. 127:1193-1207(2006). +XX +RN [8] +RX PUBMED; 19460142. +RA Kato M, de Lencastre A, Pincus Z, Slack FJ; +RT "Dynamic expression of small non-coding RNAs, including novel microRNAs +RT and piRNAs/21U-RNAs, during Caenorhabditis elegans development"; +RL Genome Biol. 10:R54(2009). +XX +RN [9] +RX PUBMED; 20062054. +RA Zisoulis DG, Lovci MT, Wilbert ML, Hutt KR, Liang TY, Pasquinelli AE, Yeo +RA GW; +RT "Comprehensive discovery of endogenous Argonaute binding sites in +RT Caenorhabditis elegans"; +RL Nat Struct Mol Biol. 17:173-179(2010). +XX +RN [10] +RX PUBMED; 21307183. +RA Warf MB, Johnson WE, Bass BL; +RT "Improved annotation of C. elegans microRNAs by deep sequencing reveals +RT structures associated with processing by Drosha and Dicer"; +RL RNA. 17:563-577(2011). +XX +DR RFAM; RF00103; mir-1. +DR WORMBASE; T09B4/23107-23012; . +XX +CC miR-1 was independently identified in C. elegans [1,2] and Drosophila +CC melanogaster (MIR:MI0000116) [3]. The sequence is also conserved in C. +CC briggsae (MIR:MI0000493). 
+XX +FH Key Location/Qualifiers +FH +FT miRNA 21..42 +FT /accession="MIMAT0020301" +FT /product="cel-miR-1-5p" +FT /evidence=experimental +FT /experiment="Illumina [10]" +FT miRNA 61..81 +FT /accession="MIMAT0000003" +FT /product="cel-miR-1-3p" +FT /evidence=experimental +FT /experiment="cloned [1-2,4-5], Northern [1], Illumina +FT [10,8], PCR [6], 454 [7], CLIPseq [9]" +XX +SQ Sequence 96 BP; 32 A; 16 C; 23 G; 0 T; 25 other; + aaagugaccg uaccgagcug cauacuuccu uacaugccca uacuauauca uaaauggaua 60 + uggaauguaa agaaguaugu agaacggggu gguagu 96 +// +ID cel-mir-2 standard; RNA; CEL; 98 BP. +XX +AC MI0000004; +XX +DE Caenorhabditis elegans miR-2 stem-loop +XX +RN [1] +RX PUBMED; 11679671. +RA Lau NC, Lim LP, Weinstein EG, Bartel DP; +RT "An abundant class of tiny RNAs with probable regulatory roles in +RT Caenorhabditis elegans"; +RL Science. 294:858-862(2001). +XX +RN [2] +RX PUBMED; 11679672. +RA Lee RC, Ambros V; +RT "An extensive class of small RNAs in Caenorhabditis elegans"; +RL Science. 294:862-864(2001). +XX +RN [3] +RX PUBMED; 12672692. +RA Lim LP, Lau NC, Weinstein EG, Abdelhakim A, Yekta S, Rhoades MW, Burge CB, +RA Bartel DP; +RT "The microRNAs of Caenorhabditis elegans"; +RL Genes Dev. 17:991-1008(2003). +XX +RN [4] +RX PUBMED; 12747828. +RA Ambros V, Lee RC, Lavanway A, Williams PT, Jewell D; +RT "MicroRNAs and other tiny endogenous RNAs in C. elegans"; +RL Curr Biol. 13:807-818(2003). +XX +RN [5] +RX PUBMED; 12769849. +RA Grad Y, Aach J, Hayes GD, Reinhart BJ, Church GM, Ruvkun G, Kim J; +RT "Computational and experimental identification of C. elegans microRNAs"; +RL Mol Cell. 11:1253-1263(2003). +XX +RN [6] +RX PUBMED; 17174894. +RA Ruby JG, Jan C, Player C, Axtell MJ, Lee W, Nusbaum C, Ge H, Bartel DP; +RT "Large-scale sequencing reveals 21U-RNAs and additional microRNAs and +RT endogenous siRNAs in C. elegans"; +RL Cell. 127:1193-1207(2006). +XX +RN [7] +RX PUBMED; 19460142. +RA Kato M, de Lencastre A, Pincus Z, Slack FJ; +RT "Dynamic expression of small non-coding RNAs, including novel microRNAs +RT and piRNAs/21U-RNAs, during Caenorhabditis elegans development"; +RL Genome Biol. 10:R54(2009). +XX +RN [8] +RX PUBMED; 20062054. +RA Zisoulis DG, Lovci MT, Wilbert ML, Hutt KR, Liang TY, Pasquinelli AE, Yeo +RA GW; +RT "Comprehensive discovery of endogenous Argonaute binding sites in +RT Caenorhabditis elegans"; +RL Nat Struct Mol Biol. 17:173-179(2010). +XX +RN [9] +RX PUBMED; 21307183. +RA Warf MB, Johnson WE, Bass BL; +RT "Improved annotation of C. elegans microRNAs by deep sequencing reveals +RT structures associated with processing by Drosha and Dicer"; +RL RNA. 17:563-577(2011). +XX +DR RFAM; RF00047; mir-2. +DR WORMBASE; M04C9/29652-29555; . +XX +FH Key Location/Qualifiers +FH +FT miRNA 20..41 +FT /accession="MIMAT0020302" +FT /product="cel-miR-2-5p" +FT /evidence=experimental +FT /experiment="Illumina [9]" +FT miRNA 61..83 +FT /accession="MIMAT0000004" +FT /product="cel-miR-2-3p" +FT /evidence=experimental +FT /experiment="cloned [1-4], PCR [5], 454 [6], Illumina +FT [7,9], CLIPseq [8]" +XX +SQ Sequence 98 BP; 27 A; 19 C; 22 G; 0 T; 30 other; + uaaacaguau acagaaagcc aucaaagcgg ugguugaugu guugcaaauu augacuuuca 60 + uaucacagcc agcuuugaug ugcugccugu ugcacugu 98 +// +ID cel-mir-34 standard; RNA; CEL; 97 BP. +XX +AC MI0000005; +XX +DE Caenorhabditis elegans miR-34 stem-loop +XX +RN [1] +RX PUBMED; 11679671. +RA Lau NC, Lim LP, Weinstein EG, Bartel DP; +RT "An abundant class of tiny RNAs with probable regulatory roles in +RT Caenorhabditis elegans"; +RL Science. 
294:858-862(2001). +XX +RN [2] +RX PUBMED; 12672692. +RA Lim LP, Lau NC, Weinstein EG, Abdelhakim A, Yekta S, Rhoades MW, Burge CB, +RA Bartel DP; +RT "The microRNAs of Caenorhabditis elegans"; +RL Genes Dev. 17:991-1008(2003). +XX +RN [3] +RX PUBMED; 12747828. +RA Ambros V, Lee RC, Lavanway A, Williams PT, Jewell D; +RT "MicroRNAs and other tiny endogenous RNAs in C. elegans"; +RL Curr Biol. 13:807-818(2003). +XX +RN [4] +RX PUBMED; 12769849. +RA Grad Y, Aach J, Hayes GD, Reinhart BJ, Church GM, Ruvkun G, Kim J; +RT "Computational and experimental identification of C. elegans microRNAs"; +RL Mol Cell. 11:1253-1263(2003). +XX +RN [5] +RX PUBMED; 17174894. +RA Ruby JG, Jan C, Player C, Axtell MJ, Lee W, Nusbaum C, Ge H, Bartel DP; +RT "Large-scale sequencing reveals 21U-RNAs and additional microRNAs and +RT endogenous siRNAs in C. elegans"; +RL Cell. 127:1193-1207(2006). +XX +RN [6] +RX PUBMED; 19460142. +RA Kato M, de Lencastre A, Pincus Z, Slack FJ; +RT "Dynamic expression of small non-coding RNAs, including novel microRNAs +RT and piRNAs/21U-RNAs, during Caenorhabditis elegans development"; +RL Genome Biol. 10:R54(2009). +XX +RN [7] +RX PUBMED; 20062054. +RA Zisoulis DG, Lovci MT, Wilbert ML, Hutt KR, Liang TY, Pasquinelli AE, Yeo +RA GW; +RT "Comprehensive discovery of endogenous Argonaute binding sites in +RT Caenorhabditis elegans"; +RL Nat Struct Mol Biol. 17:173-179(2010). +XX +DR WORMBASE; Y41G9A/23565-23469; . +XX +FH Key Location/Qualifiers +FH +FT miRNA 16..37 +FT /accession="MIMAT0000005" +FT /product="cel-miR-34-5p" +FT /evidence=experimental +FT /experiment="cloned [1-3], Northern [1], PCR [4], 454 [5], +FT Illumina [6], CLIPseq [7]" +FT miRNA 53..74 +FT /accession="MIMAT0015093" +FT /product="cel-miR-34-3p" +FT /evidence=experimental +FT /experiment="CLIPseq [7]" +XX +SQ Sequence 97 BP; 21 A; 27 C; 23 G; 0 T; 26 other; + cggacaaugc ucgagaggca gugugguuag cugguugcau auuuccuuga caacggcuac 60 + cuucacugcc accccgaaca ugucguccau cuuugaa 97 +// +ID cel-mir-35 standard; RNA; CEL; 97 BP. +XX +AC MI0000006; +XX +DE Caenorhabditis elegans miR-35 stem-loop +XX +RN [1] +RX PUBMED; 11679671. +RA Lau NC, Lim LP, Weinstein EG, Bartel DP; +RT "An abundant class of tiny RNAs with probable regulatory roles in +RT Caenorhabditis elegans"; +RL Science. 294:858-862(2001). +XX +RN [2] +RX PUBMED; 12672692. +RA Lim LP, Lau NC, Weinstein EG, Abdelhakim A, Yekta S, Rhoades MW, Burge CB, +RA Bartel DP; +RT "The microRNAs of Caenorhabditis elegans"; +RL Genes Dev. 17:991-1008(2003). +XX +RN [3] +RX PUBMED; 12747828. +RA Ambros V, Lee RC, Lavanway A, Williams PT, Jewell D; +RT "MicroRNAs and other tiny endogenous RNAs in C. elegans"; +RL Curr Biol. 13:807-818(2003). +XX +RN [4] +RX PUBMED; 17174894. +RA Ruby JG, Jan C, Player C, Axtell MJ, Lee W, Nusbaum C, Ge H, Bartel DP; +RT "Large-scale sequencing reveals 21U-RNAs and additional microRNAs and +RT endogenous siRNAs in C. elegans"; +RL Cell. 127:1193-1207(2006). +XX +RN [5] +RX PUBMED; 19460142. +RA Kato M, de Lencastre A, Pincus Z, Slack FJ; +RT "Dynamic expression of small non-coding RNAs, including novel microRNAs +RT and piRNAs/21U-RNAs, during Caenorhabditis elegans development"; +RL Genome Biol. 10:R54(2009). +XX +RN [6] +RX PUBMED; 20062054. +RA Zisoulis DG, Lovci MT, Wilbert ML, Hutt KR, Liang TY, Pasquinelli AE, Yeo +RA GW; +RT "Comprehensive discovery of endogenous Argonaute binding sites in +RT Caenorhabditis elegans"; +RL Nat Struct Mol Biol. 17:173-179(2010). +XX +RN [7] +RX PUBMED; 21307183. 
+RA Warf MB, Johnson WE, Bass BL; +RT "Improved annotation of C. elegans microRNAs by deep sequencing reveals +RT structures associated with processing by Drosha and Dicer"; +RL RNA. 17:563-577(2011). +XX +DR WORMBASE; Y62F5A/16840-16936; . +XX +FH Key Location/Qualifiers +FH +FT miRNA 22..44 +FT /accession="MIMAT0020303" +FT /product="cel-miR-35-5p" +FT /evidence=experimental +FT /experiment="Illumina [7]" +FT miRNA 61..82 +FT /accession="MIMAT0000006" +FT /product="cel-miR-35-3p" +FT /evidence=experimental +FT /experiment="cloned [1-3], Northern [1], 454 [4], Illumina +FT [5,7], CLIPseq [6]" +XX +SQ Sequence 97 BP; 20 A; 25 C; 22 G; 0 T; 30 other; + ucucggauca gaucgagcca uugcugguuu cuuccacagu gguacuuucc auuagaacua 60 + ucaccgggug gaaacuagca guggcucgau cuuuucc 97 +// +ID cel-mir-36 standard; RNA; CEL; 97 BP. +XX +AC MI0000007; +XX +SQ Sequence 97 BP; 20 A; 25 C; 22 G; 0 T; 30 other; + ucucggauca gaucgagcca uugcugguuu cuuccacagu gguacuuucc auuagaacua 60 + ucaccgggug gaaacuagca guggcucgau cuuuucc 97 +// diff --git a/src/python/test/xrefs/parsers/flatfiles/reactome_UniProt.txt b/src/python/test/xrefs/parsers/flatfiles/reactome_UniProt.txt new file mode 100644 index 000000000..3905bbc4a --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/reactome_UniProt.txt @@ -0,0 +1,8 @@ +A0A075B6P5 R-HSA-109582 https://reactome.org/PathwayBrowser/#/R-HSA-109582 Hemostasis TAS Homo sapiens +A0A075B6S6 R-HSA-1280218 https://reactome.org/PathwayBrowser/#/R-HSA-1280218 Adaptive Immune System TAS Homo sapiens +A0A075B7I6 R-HSA-1280218 https://reactome.org/PathwayBrowser/#/R-HSA-1280218 Adaptive Immune System IEA Homo sapiens +A0A087WPF7 R-HSA-1643685 https://reactome.org/PathwayBrowser/#/R-HSA-1643685 Disease TAS Homo sapiens +A0A087WPF7 R-HSA-1643685 https://reactome.org/PathwayBrowser/#/R-HSA-1643685 Disease IEA Homo sapiens +A0A087WRR7 R-HSA-166658 https://reactome.org/PathwayBrowser/#/R-HSA-166658 Complement cascade TAS Homo sapiens +A0A096LNF2 R-HSA-166663 https://reactome.org/PathwayBrowser/#/R-HSA-166663 Initial triggering of complement TAS Homo sapiens +A0A096MK16 R-HSA-166786 https://reactome.org/PathwayBrowser/#/R-HSA-166786 Creation of C4 and C2 activators TAS Homo sapiens diff --git a/src/python/test/xrefs/parsers/flatfiles/reactome_ensembl.txt b/src/python/test/xrefs/parsers/flatfiles/reactome_ensembl.txt new file mode 100644 index 000000000..6b5bb0c3d --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/reactome_ensembl.txt @@ -0,0 +1,14 @@ +ENSG00000000419 R-HSA-162699 https://reactome.org/PathwayBrowser/#/R-HSA-162699 Synthesis of dolichyl-phosphate mannose TAS Homo sapiens +ENSG00000000419 R-HSA-163125 https://reactome.org/PathwayBrowser/#/R-HSA-163125 Post-translational modification: synthesis of GPI-anchored proteins TAS Homo sapiens +ENSG00000000419 R-HSA-1643685 https://reactome.org/PathwayBrowser/#/R-HSA-1643685 Disease TAS Homo sapiens +ENSG00000000419 R-HSA-3781865 https://reactome.org/PathwayBrowser/#/R-HSA-3781865 Diseases of glycosylation TAS Homo sapiens +ENSG00000000419 R-HSA-392499 https://reactome.org/PathwayBrowser/#/R-HSA-392499 Metabolism of proteins TAS Homo sapiens +ENSX00000000419 R-HSA-446193 https://reactome.org/PathwayBrowser/#/R-HSA-446193 Biosynthesis of the N-glycan precursor (dolichol lipid-linked oligosaccharide, LLO) and transfer to a nascent protein TAS Homo sapiens +ENSG00000000419 R-HSA-446203 https://reactome.org/PathwayBrowser/#/R-HSA-446203 Asparagine N-linked glycosylation TAS Homo sapiens +ENST00000000233 R-HSA-199977 
https://reactome.org/PathwayBrowser/#/R-HSA-199977 ER to Golgi Anterograde Transport TAS Homo sapiens +ENST00000000233 R-HSA-199991 https://reactome.org/PathwayBrowser/#/R-HSA-199991 Membrane Trafficking TAS Homo sapiens +ENST00000000233 R-HSA-392499 https://reactome.org/PathwayBrowser/#/R-HSA-392499 Metabolism of proteins TAS Homo sapiens +ENST00000000233 R-HSA-446203 https://reactome.org/PathwayBrowser/#/R-HSA-446203 Asparagine N-linked glycosylation TAS Homo sapiens +ENSP00000000233 R-HSA-199977 https://reactome.org/PathwayBrowser/#/R-HSA-199977 ER to Golgi Anterograde Transport TAS Homo sapiens +ENSP00000000233 R-HSA-199991 https://reactome.org/PathwayBrowser/#/R-HSA-199991 Membrane Trafficking TAS Homo sapiens +ENSP00000000233 R-HSA-392499 https://reactome.org/PathwayBrowser/#/R-HSA-392499 Metabolism of proteins TAS Homo sapiens diff --git a/src/python/test/xrefs/parsers/flatfiles/reactome_release.txt b/src/python/test/xrefs/parsers/flatfiles/reactome_release.txt new file mode 100644 index 000000000..9f7285879 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/reactome_release.txt @@ -0,0 +1 @@ +88 \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/flatfiles/refseq_protein.txt b/src/python/test/xrefs/parsers/flatfiles/refseq_protein.txt new file mode 100644 index 000000000..ce94b07e6 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/refseq_protein.txt @@ -0,0 +1,291 @@ +LOCUS NP_001355183 382 aa linear PRI 26-JUN-2020 +DEFINITION killer cell immunoglobulin-like receptor 3DS1-like precursor [Homo + sapiens]. +ACCESSION NP_001355183 XP_024308382 +VERSION NP_001355183.1 +DBSOURCE REFSEQ: accession NM_001368254.1 +KEYWORDS RefSeq; RefSeq Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..382 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="19" + /map="19" + CDS 1..382 + /gene="LOC112268355" + /coded_by="NM_001368254.1:47..1195" + /db_xref="GeneID:112268355" +ORIGIN + 1 mllmvvsmac vglflvqrag phmggqdkpf lsawpsavvp rgghvtlrch yrhrfnnfml + 61 ykedrihvpi fhgrifqegf nmspvttaha gnytcrgshp hsptgwsaps npmvimvtgn + 121 hrwcsnkkkc ccngpracre qk +// +LOCUS NP_001337906 44 aa linear PRI 01-JUL-2020 +DEFINITION putative keratin-associated protein 20-4 [Homo sapiens]. +ACCESSION NP_001337906 +VERSION NP_001337906.1 +DBSOURCE REFSEQ: accession NM_001350977.1 +KEYWORDS RefSeq; MANE Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..44 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="21" + /map="21q22.11" + CDS 1..44 + /gene="KRTAP20-4" + /gene_synonym="KAP20.4" + /coded_by="NM_001350977.1:32..166" + /db_xref="CCDS:CCDS86982.1" + /db_xref="GeneID:100151643" + /db_xref="HGNC:HGNC:34002" +ORIGIN + 1 msyyshlsgg lgcglavavt mgrtvavaey grcrhgchss ysar +// +LOCUS XP_001243796 530 aa linear PRI 01-JUL-2020 +DEFINITION ubiquitin specific peptidase 17 like family member 30 [Homo + sapiens]. +ACCESSION XP_001243796 XP_001130476 XP_003403824 +VERSION XP_001243796.1 +DBSOURCE REFSEQ: accession NM_001256867.1 +KEYWORDS RefSeq; MANE Select. 
+SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..530 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="4" + /map="4p16.1" + CDS 1..530 + /gene="USP17L30" + /coded_by="NM_001256867.1:1..1593" + /db_xref="CCDS:CCDS59471.1" + /db_xref="GeneID:728419" + /db_xref="HGNC:HGNC:44458" +ORIGIN + 1 meddslylrg ewqfnhfskl tssrpdaafa eiqrtslpek splscetrvd lcddlapvar + 61 qlapreklpl ssrrpaavga glqnmgntcy vnaslqclty tpplanymls rehsqtchrh + 121 kgcmlctmqa hitralhnpg hviqpsqala agfhrgkqed aheflmftvd amkkaclpgh + 181 kqvdhhskdt tlihqifggy wrsqikclhc hgisdtfdpy ldialdiqaa qsvqqaleql + 241 vkpeelngen ayhcgvclqr apasktltlh tsakvlilvl krfsdvtgnk iaknvqypec + 301 ldmqpymsqp ntgplvyvly avlvhagwsc hnghyfsyvk aqegqwykmd daevtassit + 361 svlsqqayvl fyiqksewer hsesvsrgre pralgaedtd rratqgelkr dhpclqapel + 421 dehlveratq estldhwkfl qeqnktkpef nvrkvegtlp pdvlvihqsk ykcgmknhhp + 481 eqqssllnls sstpthqesm ntgtlaslrg rarrskgknk hskrallvcq +// +LOCUS NP_001229257 530 aa linear PRI 01-JUL-2020 +DEFINITION ubiquitin specific peptidase 17 like family member 26 [Homo + sapiens]. +ACCESSION NP_001229257 XP_001130428 XP_001721948 +VERSION NP_001229257.1 +DBSOURCE REFSEQ: accession NP_001229259.1 +KEYWORDS RefSeq; MANE Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..530 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="4" + /map="4p16.1" + CDS 1..530 + /gene="USP17L26" + /coded_by="NP_001229259.1:1..1593" + /db_xref="CCDS:CCDS59466.1" + /db_xref="GeneID:728379" + /db_xref="HGNC:HGNC:44454" +ORIGIN + 1 meddslylrg ewqfnhfskl tssrpdaafa eiqrtslpek splscetrvd lcddlapvar + 61 qlapreklpl ssrrpaavga glqnmgntcy vnaslqclty tpplanymls rehsqtchrh + 121 kgcmlctmqa hitralhnpg hviqpsqala agfhrgkqed aheflmftvd amkkaclpgh + 181 kqvdhhskdt tlihqifggy wrsqikclhc hgisdtfdpy ldialdiqaa qsvqqaleql + 241 vkpeelngen ayhcgvclqr apasktltlh tsakvlilvl krfsdvtgnk iaknvqypec + 301 ldmqpymsqp ntgplvyvly avlvhagwsc hnghyfsyvk aqegqwykmd daevtassit + 361 svlsqqayvl fyiqksewer hsesvsrgre pralgaedtd rratqgelkr dhpclqapel + 421 dehlveratq estldhwkfl qeqnktkpef nvrkvegtlp pdvlvihqsk ykcgmknhhp + 481 eqqssllnls sstpthqesm ntgtlaslrg rarrskgknk hskrallvcq +// +LOCUS XP_001243802 530 aa linear PRI 01-JUL-2020 +DEFINITION ubiquitin carboxyl-terminal hydrolase 17-like protein 1 [Homo + sapiens]. +ACCESSION XP_001243802 XP_006725126 XP_011544822 +VERSION XP_001243802.1 +DBSOURCE REFSEQ: accession NM_001256873.1 +KEYWORDS RefSeq; MANE Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..530 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="8" + /map="8p23.1" + CDS 1..530 + /gene="USP17L1" + /gene_synonym="USP17L1P" + /coded_by="NM_001256873.1:1..1593" + /db_xref="CCDS:CCDS78298.1" + /db_xref="GeneID:401447" + /db_xref="HGNC:HGNC:37182" +ORIGIN + 1 mgddslylgg ewqfnhfskl tssrpdaafa eiqrtslpek splssetrvd lcddlapvar + 61 qlapreklpl ssrrpaavga glqnmgntcy enaslqclty tlplanymls rehsqtcqrp + 121 kccmlctmqa hitwalhspg hviqpsqala agfhrgkqed vheflmftvd amkkaclpgh + 181 kqvdhhckdt tlihqifggc wrsqikclhc hgisdtfdpy ldialdiqaa qsvkqaleql + 241 vkpeelngen ayhcglclqr apasntltlh tsakvlilvl krfsdvagnk laknvqypec + 301 ldmqpymsqq ntgplvyvly avlvhagwsc hdghyfsyvk aqevqwykmd daevtvcsii + 361 svlsqqayvl fyiqksewer hsesvsrgre pralgaedtd rrakqgelkr dhpclqapel + 421 dehlveratq estldhwkfl qeqnktkpef nvgkvegtlp pnalvihqsk ykcgmknhhp + 481 eqqssllnls sttrtdqesm ntgtlaslqg rtrrakgknk hskrallvcq +// +LOCUS XP_001229255 530 aa linear PRI 01-JUL-2020 +DEFINITION ubiquitin specific peptidase 17 like family member 25 [Homo + sapiens]. +ACCESSION XP_001229255 XP_001130417 +VERSION XP_001229255.1 +DBSOURCE REFSEQ: accession NM_001242326.1 +KEYWORDS RefSeq; MANE Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..530 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="4" + /map="4p16.1" + CDS 1..530 + /gene="USP17L25" + /coded_by="NM_001242326.1:1..1593" + /db_xref="CCDS:CCDS59465.1" + /db_xref="GeneID:728373" + /db_xref="HGNC:HGNC:44452" +ORIGIN + 1 meddslylrg ewqfnhfskl tssrpdaafa eiqrtslpek splscetrvd lcddlapvar + 61 qlapreklpl ssrrpaavga glqnmgntcy vnaslqclty tpplanymls rehsqtchrh + 121 kgcmlctmqa hitralhnpg hviqpsqala agfhrgkqed aheflmftvd amkkaclpgh + 181 kqvdhhskdt tlihqifggy wrsqikclhc hgisdtfdpy ldialdiqaa qsvqqaleql + 241 vkpeelngen ayhcgvclqr apasktltlh tsakvlilvl krfsdvtgnk iaknvqypec + 301 ldmqpymsqp ntgplvyvly avlvhagwsc hnghyfsyvk aqegqwykmd daevtassit + 361 svlsqqayvl fyiqksewer hsesvsrgre pralgaedtd rratqgelkr dhpclqapel + 421 dehlveratq estldhwkfl qeqnktkpef nvrkvegtlp pdvlvihqsk ykcgmknhhp + 481 eqqssllnls sstpthqesm ntgtlaslrg rarrskgknk hskrallvcq +// +LOCUS NP_001229261 530 aa linear PRI 01-JUL-2020 +DEFINITION ubiquitin specific peptidase 17 like family member 29 [Homo + sapiens]. +ACCESSION NP_001229261 XP_001130464 +VERSION NP_001229261.1 +DBSOURCE REFSEQ: accession NM_001242332.1 +KEYWORDS RefSeq; MANE Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..530 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="4" + /map="4p16.1" + CDS 1..530 + /gene="USP17L29" + /coded_by="NM_001242332.1:1..1593" + /db_xref="CCDS:CCDS59470.1" + /db_xref="GeneID:728405" + /db_xref="HGNC:HGNC:44457" +ORIGIN + 1 meddslylrg ewqfnhfskl tssrpdaafa eiqrtslpek splscetrvd lcddlapvar + 61 qlapreklpl ssrrpaavga glqnmgntcy vnaslqclty tpplanymls rehsqtchrh + 121 kgcmlctmqa hitralhnpg hviqpsqala agfhrgkqed aheflmftvd amkkaclpgh + 181 kqvdhhskdt tlihqifggy wrsqikclhc hgisdtfdpy ldialdiqaa qsvqqaleql + 241 vkpeelngen ayhcgvclqr apasktltlh tsakvlilvl krfsdvtgnk iaknvqypec + 301 ldmqpymsqp ntgplvyvly avlvhagwsc hnghyfsyvk aqegqwykmd daevtassit + 361 svlsqqayvl fyiqksewer hsesvsrgre pralgaedtd rratqgelkr dhpclqapel + 421 dehlveratq estldhwkfl qeqnktkpef nvrkvegtlp pdvlvihqsk ykcgmknhhp + 481 eqqssllnls sstpthqesm ntgtlaslrg rarrskgknk hskrallvcq +// +LOCUS NP_001229260 530 aa linear PRI 01-JUL-2020 +DEFINITION ubiquitin specific peptidase 17 like family member 28 [Homo + sapiens]. +ACCESSION NP_001229260 XP_001130452 +VERSION NP_001229260.1 +DBSOURCE REFSEQ: accession NM_001242331.1 +KEYWORDS RefSeq; MANE Select. +SOURCE Bos taurus (cow) + ORGANISM Bos taurus + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..530 + /organism="Bos taurus" + /db_xref="taxon:9913" + /chromosome="4" + /map="4p16.1" + CDS 1..530 + /gene="USP17L28" + /coded_by="NM_001242331.1:1..1593" + /db_xref="CCDS:CCDS59469.1" + /db_xref="GeneID:728400" + /db_xref="HGNC:HGNC:44456" +ORIGIN + 1 meddslylrg ewqfnhfskl tssrpdaafa eiqrtslpek splscetrvd lcddlapvar + 61 qlapreklpl ssrrpaavga glqnmgntcy vnaslqclty tpplanymls rehsqtchrh + 121 kgcmlctmqa hitralhnpg hviqpsqala agfhrgkqed aheflmftvd amkkaclpgh + 181 kqvdhhskdt tlihqifggy wrsqikclhc hgisdtfdpy ldialdiqaa qsvqqaleql + 241 vkpeelngen ayhcgvclqr apasktltlh tsakvlilvl krfsdvtgnk iaknvqypec + 301 ldmqpymsqp ntgplvyvly avlvhagwsc hnghyfsyvk aqegqwykmd daevtassit + 361 svlsqqayvl fyiqksewer hsesvsrgre pralgaedtd rratqgelkr dhpclqapel + 421 dehlveratq estldhwkfl qeqnktkpef nvrkvegtlp pdvlvihqsk ykcgmknhhp + 481 eqqssllnls sstpthqesm ntgtlaslrg rarrskgknk hskrallvcq +// +LOCUS NP_001229259 530 aa linear PRI 01-JUL-2020 +DEFINITION ubiquitin specific peptidase 17 like family member 27 [Homo + sapiens]. +ACCESSION NP_001229259 XP_001130444 +VERSION NP_001229259.1 +DBSOURCE REFSEQ: accession NM_001242328.1 +KEYWORDS RefSeq; MANE Select. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..530 + /organism="Homo sapiens" + /db_xref="taxon:9606" + /chromosome="4" + /map="4p16.1" + CDS 1..530 + /gene="USP17L27" + /coded_by="NM_001242328.1:1..1593" + /db_xref="CCDS:CCDS59468.1" + /db_xref="GeneID:728393" + /db_xref="HGNC:HGNC:44455" +ORIGIN + 1 meddslylrg ewqfnhfskl tssrpdaafa eiqrtslpek splscetrvd lcddlapvar + 61 qlapreklpl ssrrpaavga glqnmgntcy vnaslqclty tpplanymls rehsqtchrh + 121 kgcmlctmqa hitralhnpg hviqpsqala agfhrgkqed aheflmftvd amkkaclpgh + 181 kqvdhhskdt tlihqifggy wrsqikclhc hgisdtfdpy ldialdiqaa qsvqqaleql + 241 vkpeelngen ayhcgvclqr apasktltlh tsakvlilvl krfsdvtgnk iaknvqypec + 301 ldmqpymsqp ntgplvyvly avlvhagwsc hnghyfsyvk aqegqwykmd daevtassit + 361 svlsqqayvl fyiqksewer hsesvsrgre pralgaedtd rratqgelkr dhpclqapel + 421 dehlveratq estldhwkfl qeqnktkpef nvrkvegtlp pdvlvihqsk ykcgmknhhp + 481 eqqssllnls sstpthqesm ntgtlaslrg rarrskgknk hskrallvcq +// diff --git a/src/python/test/xrefs/parsers/flatfiles/refseq_release.txt b/src/python/test/xrefs/parsers/flatfiles/refseq_release.txt new file mode 100644 index 000000000..ecaeeb1e8 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/refseq_release.txt @@ -0,0 +1,94 @@ +******************************************************************************** +RefSeq-release224.txt ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-notes/ + + NCBI Reference Sequence (RefSeq) Database + + Release 224 + May 6, 2024 + + Distribution Release Notes + +Release Size: + 150742 organisms + 4379003578168 nucleotide bases + 126991769080 amino acids + 435879646 records +****************************************************************************** + +This document describes the format and content of the flat files that +comprise releases of the NCBI Reference Sequence (RefSeq) database. + +Additional information about RefSeq is available at: + +1. NCBI Bookshelf: + a) NCBI Handbook: + https://www.ncbi.nlm.nih.gov/books/NBK21091/ + b) RefSeq Help (FAQ) + https://www.ncbi.nlm.nih.gov/books/NBK50680/ + +2. RefSeq Web Sites: + RefSeq Home: https://www.ncbi.nlm.nih.gov/refseq/ + RefSeqGene Home: https://www.ncbi.nlm.nih.gov/refseq/rsg/ + +If you have any questions or comments about RefSeq, the RefSeq release files +or this document, please contact NCBI by email at: + info@ncbi.nlm.nih.gov. + +To receive announcements of future RefSeq releases and large updates please +subscribe to NCBI's refseq-announce mail list: + + send email to refseq-announce-subscribe@ncbi.nlm.nih.gov + with "subscribe" in the subject line (without quotes) + and nothing in the email body + +OR + +subscribe using the web interface at: + https://www.ncbi.nlm.nih.gov/mailman/listinfo/refseq-announce + +============================================================================= +TABLE OF CONTENTS +============================================================================= +1. INTRODUCTION + 1.1 This release + 1.2 Cutoff date + 1.3 RefSeq Project Background + 1.3.1 Sequence accessions, validation, and annotations + 1.3.2 Data assembly, curation, and collaboration + 1.3.3 Biologically non-redundant data set + 1.3.4 RefSeq and DDBJ/EMBL/GenBank comparison + 1.4 Uses and applications of the RefSeq database +2. CONTENT + 2.1 Organisms included + 2.2 Molecule Types included + 2.3 Known Problems, Redundancies, and Inconsistencies + 2.4 Release Catalog + 2.5 Changes since the previous release +3. 
ORGANIZATION OF DATA FILES + 3.1 FTP Site Organization + 3.2 Release Contents + 3.3 File Names and Formats + 3.4 File Sizes + 3.5 Statistics + 3.6 Release Catalog + 3.7 Removed Records + 3.8 Accession Format + 3.9 Growth of RefSeq +4. FLAT FILE ANNOTATION + 4.1 Main features of RefSeq Flat File + 4.1.1 LOCUS, DEFLINE, ACCESSION, KEYWORDS, SOURCE, ORGANISM + 4.1.2 REFERENCE, DIRECT SUBMISSION, COMMENT, PRIMARY + 4.1.3 NUCLEOTIDE FEATURE ANNOTATION (Gene, mRNA, CDS) + 4.1.4 PROTEIN FEATURE ANNOTATION + 4.2 Tracking Identifiers + 4.2.1 GeneID + 4.2.2 Transcript ID + 4.2.3 Protein ID + 4.2.4 Conserved Domain Database (CDD) ID +5. REFSEQ ADMINISTRATION + 5.1 Citing RefSeq + 5.2 RefSeq Distribution Formats + 5.3 Other Methods of Accessing RefSeq Data + 5.4 Request for Corrections and Comments + 5.5 Credits and Acknowledgements + 5.6 Disclaimer diff --git a/src/python/test/xrefs/parsers/flatfiles/refseq_rna.txt b/src/python/test/xrefs/parsers/flatfiles/refseq_rna.txt new file mode 100644 index 000000000..da86f9d30 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/refseq_rna.txt @@ -0,0 +1,508 @@ +LOCUS NR_168385 3420 bp RNA linear PRI 13-MAY-2020 +DEFINITION Homo sapiens LOC105373289 (LOC105373289), transcript variant 6, + long non-coding RNA. +ACCESSION NR_168385 +VERSION NR_168385.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..3420 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="1" + /map="1q42.13" + gene 1..3420 + /gene="LOC105373289" + /note="LOC105373289" + /db_xref="GeneID:105373289" + ncRNA 1..3420 + /ncRNA_class="lncRNA" + /gene="LOC105373289" + /product="LOC105373289, transcript variant 6" + /db_xref="GeneID:105373289" +ORIGIN + 1 agcagggcgt ccagcggaga aggcagagga ggggagatgc gggctcctcc aggtagcgca + 61 ggagcccctc cggctgccgg agccccgcga gggcgcgagt ggagggcagg agcccgggcg + 121 gcggaggagc ggaagggatg ctgcgttgcc ttggagtgtc agggtggggg aggaaagacc + 181 aagggaccca cgtccttcgc ccccgccgcg gagtcccggg ccggcgagac ttccgcagcc + 241 tgcccagcgc cggggaccta gggctttgca ggagtccgcc cgggagctct atcagagcgg + 301 gcgtcctccc cgccgctcca aaggtggctt ggggcaggtg gggcgtcccg gagggaatgg + 361 agggaccctg cctagggaag gagggatttc gctgcctgtg gggcttcagt gctgaaccag + 421 gcagccctga gcagaccagg accgagcttc ccaaacctga ccgggaagga gccctggttg + 481 catctgggat ccacgtggtc gacagagaat cagctcgcag ctcaccaccc cagtgacttc + 541 agggcagccc accttcccct ggcgctcctc aaacgagcca gggagtggcc cctgctcaga + 601 ctcccctcct gcctcccgga ccctgcaggc ctacccgccc cagttgccct ttgccctcct + 661 gcagccttct gggggtgcta catgtctgag gcccggtctt ctgtcctgct cctcctgatg + 721 gggggtctgg gcactctccc taattcatcg cgaagactct gacacccaat gcccgtcttc + 781 aggccccggc agatgcagag aagtgggctt cacacccaca tctgcctgac ctcaggtgct + 841 ggctcctgca gtcacagccc tgagccccgg cccctccagg ctgtctcctg cttgtccagg + 901 tgggcatgag ctggtcagtt cctggccact gcccttcaga ccccatgcca ggactttggg + 961 ttgggctctg ggcatggcac tagccaggcc tgggtgcctc cttgagcagc tgagggctgg + 1021 gagggatgac aatgtaagcg gctatctggc ttcaggccca ggctggccat ctggtggcca +// +LOCUS NR_168384 3463 bp RNA linear PRI 13-MAY-2020 +DEFINITION Homo sapiens LOC105373289 (LOC105373289), transcript variant 5, + long non-coding RNA. +ACCESSION NR_168384 +VERSION NR_168384.1 +KEYWORDS RefSeq. 
+SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..3463 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="1" + /map="1q42.13" + gene 1..3463 + /gene="LOC105373289" + /note="LOC105373289" + /db_xref="GeneID:105373289" + ncRNA 1..3463 + /ncRNA_class="lncRNA" + /gene="LOC105373289" + /product="LOC105373289, transcript variant 5" + /db_xref="GeneID:105373289" +ORIGIN + 1 agcagggcgt ccagcggaga aggcagagga ggggagatgc gggctcctcc aggtagcgca + 61 ggagcccctc cggctgccgg agccccgcga gggcgcgagt ggagggcagg agcccgggcg + 121 gcggaggagc ggaagggatg ctgcgttgcc ttggagtgtc agggtggggg aggaaagacc + 181 aagggaccca cgtccttcgc ccccgccgcg gagtcccggg ccggcgagac ttccgcagcc + 241 tgcccagcgc cggggaccta gggctttgca ggagtccgcc cgggagctct atcagagcgg + 301 gcgtcctccc cgccgctcca aaggtggctt ggggcaggtg gggcgtcccg gagggaatgg + 361 agggaccctg cctagggaag tctggcacat tcctccccaa acaccggcgt cttcccatgg + 421 caggagggat ttcgctgcct gtggggcttc agtgctgaac caggcagccc tgagcagacc + 481 aggaccgagc ttcccaaacc tgaccgggaa ggagccctgg ttgcatctgg gatccacgtg + 541 gtcgacagag aatcagctcg cagctcacca ccccagtgac ttcagggcag cccaccttcc + 601 cctggcgctc ctcaaacgag ccagggagtg gcccctgctc agactcccct cctgcctccc + 661 ggaccctgca ggcctacccg ccccagttgc cctttgccct cctgcagcct tctgggggtg + 721 ctacatgtct gaggcccggt cttctgtcct gctcctcctg atggggggtc tgggcactct + 781 ccctaattca tcgcgaagac tctgacaccc aatgcccgtc ttcaggcccc ggcagatgca + 841 gagaagtggg cttcacaccc acatctgcct gacctcaggt gctggctcct gcagtcacag + 901 ccctgagccc cggcccctcc aggctgtctc ctgcttgtcc aggtgggcat gagctggtca + 961 gttcctggcc actgcccttc agaccccatg ccaggacttt gggttgggct ctgggcatgg + 1021 agaagtattc tgcttgacat acataaaaat gcataattca aaa +// +LOCUS XR_168380 3423 bp RNA linear PRI 13-MAY-2020 +DEFINITION Homo sapiens LOC105373289 (LOC105373289), transcript variant 1, + long non-coding RNA. +ACCESSION XR_168380 +VERSION XR_168380.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..3423 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="1" + /map="1q42.13" + gene 1..3423 + /gene="LOC105373289" + /note="LOC105373289" + /db_xref="GeneID:105373289" + ncRNA 1..3423 + /ncRNA_class="lncRNA" + /gene="LOC105373289" + /product="LOC105373289, transcript variant 1" + /db_xref="GeneID:105373289" +ORIGIN + 1 agtcccaggg aggagaccgc gggagaggcg gcgggaccag ggtcccggcc ttcagcggct + 61 tgctccgcac actcagggtt ccccggccct ctggcgctgg gggagttggg tcggttgtgc + 121 atgctgcatg gccggaggct cggggccaag gccacccttc cgcacccacc actctgggag + 181 gctccagagc gcggccctga gatagtgcca cactcacccc ctggaaagga ggcaaggccg + 241 ccctggacgg aggcgactcg gagtcccggg aggaaggaac ggacacaccg gcctccctgc + 301 ggaggaggga gaacgtggtc cccagtggta tcaggaagag tctggcacat tcctccccaa + 361 acaccggcgt cttcccatgg caggagggat ttcgctgcct gtggggcttc agtgctgaac + 421 caggcagccc tgagcagacc aggaccgagc ttcccaaacc tgaccgggaa ggagccctgg + 481 ttgcatctgg gatccacgtg gtcgacagag aatcagctcg cagctcacca ccccagtgac + 541 ttcagggcag cccaccttcc cctggcgctc ctcaaacgag ccagggagtg gcccctgctc + 601 agactcccct cctgcctccc ggaccctgca ggcctacccg ccccagttgc cctttgccct + 661 cctgcagcct tctgggggtg ctacatgtct gaggcccggt cttctgtcct gctcctcctg + 721 atggggggtc tgggcactct ccctaattca tcgcgaagac tctgacaccc aatgcccgtc + 781 ttcaggcccc ggcagatgca gagaagtggg cttcacaccc acatctgcct gacctcaggt + 841 gctggctcct gcagtcacag ccctgagccc cggcccctcc aggctgtctc ctgcttgtcc + 901 aggtgggcat gagctggtca gttcctggcc actgcccttc agaccccatg ccaggacttt + 961 gggttgggct ctgggcatgg cactagccag gcctgggtgc ctccttgagc agctgagggc + 1021 tgggagggat gacaatgtaa gcggctatct ggcttcaggc ccaggctggc catctggtgg + 1081 aaa +// +LOCUS NM_001242328 925 bp RNA linear PRI 14-MAY-2020 +DEFINITION Homo sapiens uncharacterized LOC107985524 (LOC107985524), long + non-coding RNA. +ACCESSION NM_001242328 XM_024452028 +VERSION NM_001242328.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..925 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="1" + /map="1p11.2" + gene 1..925 + /gene="LOC107985524" + /note="uncharacterized LOC107985524" + /db_xref="GeneID:107985524" + ncRNA 1..925 + /ncRNA_class="lncRNA" + /gene="LOC107985524" + /product="uncharacterized LOC107985524" + /db_xref="GeneID:107985524" +ORIGIN + 1 gcacacctgg ctcacggcga gtgcggagca gaaagcacta ctggcgcggg ccacagccag + 61 ccgctttcat ctgctaagac ctcacctgaa aggcgcacca gtgccctcaa ggatcctccc + 121 gcctctgcag gatgtcgagg ctcctcctcg ccgggaggag aaagcggaat cccctccctg + 181 cattctcgga cagtgccacg tcctccggct gccagcgggg cagcgccgct aggtatgtga + 241 gcttcaaagt tggaagaaat taagcaacat gctttggaat ctatggtgat ctatagaaag + 301 gcaaagtttc tggactcacc ctgactgatg gaaagacaga ctgcctgcca ggacactacc + 361 ctgctgtacc cagtcttaag tataataaag atctcatttt ttactgtcaa tgcaagccac + 421 attttcctat taggaaaatg tgaatgaaac aaagtgctct tcaagagcaa accctgaatt + 481 atactttggg ttattctctg ttcctcaaaa ggattttgca tctaactgat agtctccaaa + 541 ttgtaatgac agtatataga tagcttggtg tagacataca ggtcaataca aatggagaaa + 601 aggcaatttg ccattgaaga atatgtttgc tttaagtaaa gatcaatata ctaagaaagc + 661 tatacatatc tagacttcca aaaacagatg ggaataaact actcagcaat cagaatattc + 721 gaagatggca ctctgttcac ttccagagaa aatagttcaa aactgtatct caaagtggat + 781 ataagctatt gtactagaat tagtccctgt gtgagcattt ggcattataa aataagatgt + 841 tcccaatgaa aagatcactg gtatgtagat aataaaatgt gaaaataaaa atttaaaaat + 901 aaaacaaaaa ttatgtgata ataaa +// +LOCUS NM_028389 2517 bp RNA linear PRI 17-MAY-2020 +DEFINITION Homo sapiens ACVR2B antisense RNA 1 (ACVR2B-AS1), long non-coding + RNA. +ACCESSION NM_028389 XM_001717717 +VERSION NM_028389.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..2517 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="3" + /map="3p22.2" + gene 1..2517 + /gene="ACVR2B-AS1" + /note="ACVR2B antisense RNA 1" + /db_xref="GeneID:100128640" + /db_xref="HGNC:HGNC:44161" + ncRNA 1..2517 + /ncRNA_class="lncRNA" + /gene="ACVR2B-AS1" + /product="ACVR2B antisense RNA 1" + /db_xref="GeneID:100128640" + /db_xref="HGNC:HGNC:44161" +ORIGIN + 1 gctacactta gtgactctga gggacatgca accctccccg catgctgctg ctgctgctgc + 61 acctacaatc ctgccacccc caatgagatc tgcccacccc tcttggccgc cttccccacg + 121 ctcaggtttt cctcactctt tccctgggtt ccacgcgccc gcgtagcccg aactccgacc + 181 ctgaggctcc gcgtcccggc ccccatcgca ggggcgcctc taggaaccag aatcccgcag + 241 atgactgcac agacaagatc gtgcccccaa gttcggcgag ccgggcgccc accgcgcccc + 301 cagcccacgc ccccggagtt cctgcgccac ccacagcggc cctgagcttc aatctgcact + 361 gactgcactc ccatctcctt ggctgcagca cctgattaaa gccttcttcc ttggcaa +// +LOCUS NM_039609 79 bp RNA linear PRI 17-MAY-2020 +DEFINITION Homo sapiens microRNA 378e (MIR378E), microRNA. +ACCESSION NM_039609 +VERSION NM_039609.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..79 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="5" + /map="5q35.1" + gene 1..79 + /gene="MIR378E" + /gene_synonym="mir-378e" + /note="microRNA 378e" + /db_xref="GeneID:100616498" + /db_xref="HGNC:HGNC:41671" + /db_xref="miRBase:MI0016750" + precursor_RNA 1..79 + /gene="MIR378E" + /gene_synonym="mir-378e" + /product="microRNA 378e" + /db_xref="GeneID:100616498" + /db_xref="HGNC:HGNC:41671" + /db_xref="miRBase:MI0016750" +ORIGIN + 1 ctgactccag tgtccaggcc aggggcagac agtggacaga gaacagtgcc caagaccact + 61 ggacttggag tcaggacat +// +LOCUS NM_039939 83 bp RNA linear PRI 24-MAY-2020 +DEFINITION Homo sapiens microRNA 4779 (MIR4779), microRNA. +ACCESSION NM_039939 +VERSION NM_039939.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..83 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="2" + /map="2p11.2" + gene 1..83 + /gene="MIR4779" + /note="microRNA 4779" + /db_xref="GeneID:100616159" + /db_xref="HGNC:HGNC:41747" + /db_xref="miRBase:MI0017423" + precursor_RNA 1..83 + /gene="MIR4779" + /product="microRNA 4779" + /db_xref="GeneID:100616159" + /db_xref="HGNC:HGNC:41747" + /db_xref="miRBase:MI0017423" +ORIGIN + 1 taaatgtctt actgctttta ctgttccctc ctagagtcca ttctttactc taggagggaa + 61 tagtaaaagc agtaagacat tta +// +LOCUS NM_003928 2843 bp RNA linear PRI 31-MAY-2020 +DEFINITION Homo sapiens chitinase, acidic pseudogene 2 (CHIAP2), non-coding + RNA. +ACCESSION NM_003928 +VERSION NM_003928.2 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..2843 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="1" + /map="1p13.2" + gene 1..2843 + /gene="CHIAP2" + /note="chitinase, acidic pseudogene 2" + /pseudo + /db_xref="GeneID:149620" + /db_xref="HGNC:HGNC:44463" + misc_RNA 1..2843 + /gene="CHIAP2" + /product="chitinase, acidic pseudogene 2" + /pseudo + /db_xref="GeneID:149620" + /db_xref="HGNC:HGNC:44463" +ORIGIN + 1 gtctgctcct tgtgctgaca gctgaaatag gctctgccta ccagctgaca tgttacttca + 61 ccaactgggc ccagaaccag ccaggcctgg ggtgcttcaa gcctgatgac atcgacccct + 121 gcctctgtac ccacttgatc tacgcctttg ctggaatgca gaacaacgag atcaccacca + 181 tcgaatggga tgacatgact ctctaccaag ctttcaatgg cctgaaaaac aagtaaatga + 241 cggaaaacct gagtttcaaa tcttttaacc tttaaggaca gtttaaacaa gatcttccac + 301 agcagacttc aggctgaaat tccaaacagg ccaacaagca ggtaaattca gctttcttat + 361 tatttcaagt gcaagaatga ctctaatttt aaggggaatg gctggctcac agaagctagc + 421 tgctaactaa agcccagctc agttgccaag ggaagcttat aagtccaact actggtggac + 481 tcagttgaga acaatcttcc acttagaagc aatccaaagc tggcattgat aaagcattca + 541 gtctccttgg tcaggagatt cactcctagg gaaataattg gaactgtgga gacattgggg + 601 aaaaaaaaaa aaaaaaaaaa aaa +// +LOCUS XM_107042 80 bp RNA linear PRI 31-MAY-2020 +DEFINITION Homo sapiens microRNA 8075 (MIR8075), microRNA. +ACCESSION XM_107042 +VERSION XM_107042.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..80 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="13" + /map="13q34" + gene 1..80 + /gene="MIR8075" + /gene_synonym="hsa-mir-8075" + /note="microRNA 8075" + /db_xref="GeneID:102465874" + /db_xref="HGNC:HGNC:50172" + /db_xref="miRBase:MI0025911" + precursor_RNA 1..80 + /gene="MIR8075" + /gene_synonym="hsa-mir-8075" + /product="microRNA 8075" + /db_xref="GeneID:102465874" + /db_xref="HGNC:HGNC:50172" + /db_xref="miRBase:MI0025911" +ORIGIN + 1 ccttgctgat ggcagatgtc ggatctgcct cgcttatacg tgcccttgct gatggcagat + 61 gtcgggtctg cctcgcttat +// +LOCUS XM_120501 718 bp RNA linear PRI 31-MAY-2020 +DEFINITION Homo sapiens SYNE1 antisense RNA 1 (SYNE1-AS1), long non-coding + RNA. +ACCESSION XM_120501 +VERSION XM_120501.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. + source 1..718 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="6" + /map="6q25.2" + gene 1..718 + /gene="SYNE1-AS1" + /note="SYNE1 antisense RNA 1" + /db_xref="GeneID:100505475" + /db_xref="HGNC:HGNC:40793" + ncRNA 1..718 + /ncRNA_class="lncRNA" + /gene="SYNE1-AS1" + /product="SYNE1 antisense RNA 1" + /db_xref="GeneID:100505475" + /db_xref="HGNC:HGNC:40793" +ORIGIN + 1 aaaaccacac agaggagagc taatcgggga gataactatt tgctgtgctg taggagatta + 61 ccaagagggt taactgctga gcccaggttt tcagaagggc actgaactgt tccagggcct + 121 gctccagttg agccacttgg cctgagaatt cctgctccga aagggccatc tggctgacca + 181 ggttctccaa acagctctgc gtttggaata cactgtcttc ccactgcttc cagtcggcac + 241 gcagggcctg catctccgtg tgcatgagct cacacccact ggcagttgtg ttctgtttca + 301 cttcgggagc cagcgactcc actctgctga gacggcttgc accaatctct ctggaatcta + 361 tcagctcctg taatggaata tcaccatggt aactgaagag cctgtgagtc acttggactg + 421 caagttttca actgtgtaca cagggggacc ctgtcctgcc aggaagtttt aacaagtgtg + 481 ccacaaggac ccagaatcaa ttcatctgtc cactcttaaa ttataatagc gggagttgtc + 541 atcacgagtg cctaaggctt atggcaaaaa aatcccgaga atcctcaata tcctgaaaca + 601 gactaaacag agaccctgaa atcaggcatt attcgcacac gtgaaaatgt ttagtgactc + 661 aagtgtttgc ctgtggtgga ttgctcctgt gaatgattaa acccatattt ccctcaaa +// +LOCUS NR_038942 1432 bp RNA linear PRI 31-MAY-2020 +DEFINITION Bos taurus promoter of MAT2A antisense radiation-induced + circulating long non-coding RNA (PARTICL), long non-coding RNA. +ACCESSION NR_038942 +VERSION NR_038942.1 +KEYWORDS RefSeq. +SOURCE Bos taurus (cow) + ORGANISM Bos taurus + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..1432 + /organism="Bos taurus" + /mol_type="transcribed RNA" + /db_xref="taxon:9913" + /chromosome="2" + /map="2p11.2" + gene 1..1432 + /gene="PARTICL" + /gene_synonym="PARTICLE" + /note="promoter of MAT2A antisense radiation-induced + circulating long non-coding RNA" + /db_xref="GeneID:100630918" + /db_xref="HGNC:HGNC:50886" + /db_xref="MIM:616350" + ncRNA 1..1432 + /ncRNA_class="lncRNA" + /gene="PARTICL" + /gene_synonym="PARTICLE" + /product="promoter of MAT2A antisense radiation-induced + circulating long non-coding RNA" + /db_xref="GeneID:100630918" + /db_xref="HGNC:HGNC:50886" + /db_xref="MIM:616350" +ORIGIN + 1 tgtgggagaa aaagcgactg gggcttgctg gacccgctcc cctgtgcggt gacatggcac + 61 ctcccttccg accgttggcc gggatagctt tcccggaggt cgcgtccccc agcggaggcg + 121 gcgggaatcg cggaggtttt gtgctgcggc tggggtcttt ctggccgtcc cgcctctaga + 181 tgccgggtgg cagggagccg gtggcggtcg gccatgtgaa ggtggccatc ttggccggcc + 241 agcgagggct cctcacggcc ttccttcggc gtccctgccc ggctcgttgc cggccccggg + 301 atatctgagg ggcgcaggcc caccttctag tcgtttcctg gtgaatggct ctgctgaaga + 361 tggccggaaa gcagattaat gaaggtgctg ccatcgattt aaacaatcgc cctcctgccc + 421 gctcccctgc gctaaagttt ctgagggatc ctcacccttc gtggttcgtg gacttaaaag + 481 tggaggcagc gctccagcct ttccctccag agagaaagga ggccgctccc aagtccgtcc + 541 ttgccccgtg gccttcctgt tcctttgaag ggggggggaa tcgatgtttc aatcctctgt + 601 tcaggagaat atggaacgaa catttctttt ttggtgggtg ggggctattc gttcccttga + 661 atgtgcttaa gcagatctct tgacggcgtg gaatgggctg tttcatgaag ctttcacttt + 721 aaaatgtcca cctgcgtttg tcccagtttt gcccaataaa ggaattacag ggaaaaagag + 781 cgaaacaaaa cttgagccag caaggagtat ggagtcccgt ttggagggag cccgccttgg + 841 gggcgggggt ctctgcagcc tgttctgggg ctgggcctcg gtgcaggtct ggtcgggctg + 901 gtggcctggt ggccctgagc ggagcgcgtg acaagaacgc cgggtttaat gaggttctca + 961 gggaacggcc tgctcctagc atgggatgta ctttacgctg gggaggtgaa ggagacccct + 1021 agtaacagtc tccagctgcc tactgctggc ggcagtagaa caaggtgcat tcccaagaac + 1081 aatcctctcc gtcggtgcag gaggaaaggt gggatttgct aaaggctcag tgggaaacaa + 1141 aggaagcact tgggctgtgc tgggggcatc cacaaagaag ataaagggct cctccccaga + 1201 ctcagtgacc acatccacgg tcgctcaaag caggcagcaa gaaaaggttc gaccacaaaa + 1261 gagacaatag gtgatgatat ttttattcgt tgctttttac ttttcaagct aacccttcat + 1321 gggaaagtaa ctgtataaga ctattaaatt tttttgttat tttttaaagg agaatgccaa + 1381 tttattaact tacaatgtgt aataaaattg tcaactggaa aaaaaaaaaa aa +// +LOCUS NY_108110 808 bp RNA linear PRI 31-MAY-2020 +DEFINITION Homo sapiens B4GALT1 antisense RNA 1 (B4GALT1-AS1), transcript + variant 3, long non-coding RNA. +ACCESSION NY_108110 +VERSION NY_108110.1 +KEYWORDS RefSeq. +SOURCE Homo sapiens (human) + ORGANISM Homo sapiens + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; + Catarrhini; Hominidae; Homo. 
+ source 1..808 + /organism="Homo sapiens" + /mol_type="transcribed RNA" + /db_xref="taxon:9606" + /chromosome="9" + /map="9p21.1" + gene 1..808 + /gene="B4GALT1-AS1" + /note="B4GALT1 antisense RNA 1" + /db_xref="GeneID:101929639" + /db_xref="HGNC:HGNC:49910" + ncRNA 1..808 + /ncRNA_class="lncRNA" + /gene="B4GALT1-AS1" + /product="B4GALT1 antisense RNA 1, transcript variant 3" + /db_xref="GeneID:101929639" + /db_xref="HGNC:HGNC:49910" +ORIGIN + 1 ggaggactgc ccgatggcgg cggcactgtt cgagccgccc tgcagcggtg tggagactcc + 61 gaccagttgg ggcaggcggc tcaggtcgcg gccagccagg ggttgggaaa ctagtttcca + 121 tggccaaacc tagcccaccg tctgttttgt ttcctgttct tcccaacctg cgctatggac + 181 ttcatcagat ttcagcatca gagagaatat ggaaggacat cgaccctaac ttcatccagt + 241 gaggatttcc acacaccata cactctctga gagttctctt ggctttgtgt gcacacctcc + 301 agtgacaggg agctcgctat gtcatgaggc agcctgctcc cttgtggcta tcactgaacc + 361 aactattaag ccttcttata caaacagtcc ctgacttact gttagactta caactttttg + 421 attttacagt ggtgcaaaag caatatgcat tccatagaaa ctgtatttca aattttgaat + 481 tttgatcttt tccaggctag caacatatga agaccaacct tctattttta aaataggctt + 541 tgtgttagat gcttttgccc aactataggt taatgtaaat gttctgagaa tatttgagga + 601 aggctaggct aaactgtgat ctttgggagc ttagatatgt taagtgcatt ttcaacttac + 661 aatattttca acttacgatg agtttattgg gatgaagccc gttgtaagta aaggagcatc + 721 tgcattaagc taaaatccat gtctataact tcctcagtaa tatcaagttt gttccttgga + 781 gccagagaat aaactgaatc cttttcca +// diff --git a/src/python/test/xrefs/parsers/flatfiles/rfam.txt b/src/python/test/xrefs/parsers/flatfiles/rfam.txt new file mode 100644 index 000000000..0dcfad606 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/rfam.txt @@ -0,0 +1,381 @@ +# STOCKHOLM 1.0 + +#=GF AC RF00001 +#=GF ID 5S_rRNA +#=GF DE 5S ribosomal RNA +#=GF AU Griffiths-Jones SR; 0000-0001-6043-807X +#=GF AU Mifsud W; 0000-0002-9805-6461 +#=GF AU Gardner PP; 0000-0002-7808-1213 +#=GF SE Szymanski et al, 5S ribosomal database, PMID:11752286 +#=GF SS Published; PMID:11283358 +#=GF GA 38.00 +#=GF TC 38.00 +#=GF NC 37.90 +#=GF TP Gene; rRNA; +#=GF BM cmbuild -F CM SEED +#=GF CB cmcalibrate --mpi CM +#=GF SM cmsearch --cpu 4 --verbose --nohmmonly -T 24.99 -Z 549862.597050 CM SEQDB +#=GF CL CL00113 +#=GF DR URL; http://rose.man.poznan.pl/5SData/; +#=GF DR SO; 0000652; rRNA_5S; +#=GF DR GO; 0003735; structural constituent of ribosome; +#=GF DR GO; 0005840; ribosome; +#=GF RN [1] +#=GF RM 11752286 +#=GF RT 5S Ribosomal RNA Database. +#=GF RA Szymanski M, Barciszewska MZ, Erdmann VA, Barciszewski J +#=GF RL Nucleic Acids Res 2002;30:176-178. +#=GF RN [2] +#=GF RM 11283358 +#=GF RT Crystal structure of the ribosome at 5.5 A resolution. +#=GF RA Yusupov MM, Yusupova GZ, Baucom A, Lieberman K, Earnest TN, Cate JH, +#=GF RA Noller HF +#=GF RL Science 2001;292:883-896. +#=GF RN [3] +#=GF RM 10926492 +#=GF RT The role of the central zinc fingers of transcription factor IIIA in +#=GF RT binding to 5 S RNA. +#=GF RA Searles MA, Lu D, Klug A +#=GF RL J Mol Biol 2000;301:47-60. +#=GF RN [4] +#=GF RM 23838690 +#=GF RT Systematic analysis and evolution of 5S ribosomal DNA in metazoans. +#=GF RA Vierna J, Wehner S, Honer zu Siederdissen C, Martinez-Lage A, Marz M +#=GF RL Heredity (Edinb). 2013;111:410-421. +#=GF CC 5S ribosomal RNA (5S rRNA) is a component of the large ribosomal subunit +#=GF CC in both prokaryotes and eukaryotes. In eukaryotes, it is synthesised by +#=GF CC RNA polymerase III (the other eukaryotic rRNAs are cleaved from a 45S +#=GF CC precursor synthesised by RNA polymerase I). 
In Xenopus oocytes, it has +#=GF CC been shown that fingers 4-7 of the nine-zinc finger transcription factor +#=GF CC TFIIIA can bind to the central region of 5S RNA. Thus, in addition to +#=GF CC positively regulating 5S rRNA transcription, TFIIIA also stabilises 5S +#=GF CC rRNA until it is required for transcription. +#=GF WK 5S_ribosomal_RNA +#=GF SQ 712 + + + +X01556.1/3-118 --CUUGAC-GA-U-C-AU-AGA----GC-G-U-U-G---GA----------A-CC-A----------C--CUG----A-UC----CCUUCC------CGA-ACUCA-GA-AGUGAA-A-----------------CGACGCA-U-C--G---CC--GAUG----GUAGUGUG----GGGUUUC-C-CCAUG-UGA---G---AGUA----GG-U-CA-UC--G-UCAAGC +X55260.1/3-119 --UACGGC-GG-C-C-AU-AGC----GA-A-G-G-G---GA----------A-AU-A----------C--CCG----G-UC----CCAUCC------CGA-ACCCG-GA-AGUCAA-G-----------------CCCUUCA-G-C--G---CC--GAUG----GUACUGCAA---CCGAGAG-G-CUGUG-GGA---G---AGUA----GG-A-CG-CC--G-CCGGAC +M16174.1/3-119 --UACGGC-GG-C-C-AU-AGC----GG-C-G-G-G---GA----------A-AC-A----------C--CCG----G-UC----CCAUGC------CGA-ACCCG-GA-AGUUAA-G-----------------CCUGCCA-G-C--G---CC--GAUG----GUACUGCAA---CCGAGAG-G-CUGUG-GGA---G---AGUA----GG-A-CG-CC--G-CCGGAC +X55267.1/3-119 --UACGGC-GG-C-C-AU-AGC----GG-A-G-G-G---GA----------A-AC-G----------C--CCG----G-UC----CCAUUC------CGA-ACCCG-GA-AGCUAA-G-----------------CCCUCCA-G-C--G---CC--GAUG----GUACUGCAC---CCGAGAG-A-GUGUG-GGA---G---AGUA----GG-A-CG-CC--G-CCGGAC +M16172.1/3-119 --UACGGC-GG-U-C-AU-AGC----GA-A-G-G-G---GA----------A-AC-G----------C--CCG----G-UC----CCAUUC------CGA-ACCCG-GA-AGCUAA-G-----------------CCCUUCA-G-C--G---CC--GAUG----GUACUGCAC---UCGCCAG-G-GUGUG-GGA---G---AGUA----GG-A-CG-CC--G-CCGGAC +#=GC SS_cons ..((((((.((.(.,.,,.,<<....-<.<.<.<.<...--..........-.<<.-..........-..<<<....<.<<....______.......>>.-->>>.>-.->>---.-.................>>>>>--.>.>..<...<<..-<<---..-<-<<----..-<<____>.>.-----.>>-...>..-->>-....>>.>..).))..).))))): +#=GC RF ..gccuGc.gg.c.C.AU.Acc....ag.c.g.c.g...aA..........a.gc.A..........C..cgG....a.uC....CCAUCc.......Ga.ACuCc.gA.AguUAA.G.................cgcgcUu.g.g..g...Cc..agggUA..GUAcuagGa..UGgGuGAc.C.uCcUG.ggA...A..gacca....gG.u..g.cc..g.Caggcc +// +# STOCKHOLM 1.0 + +#=GF AC RF00002 +#=GF ID 5_8S_rRNA +#=GF DE 5.8S ribosomal RNA +#=GF AU Griffiths-Jones SR; 0000-0001-6043-807X +#=GF AU Mifsud W; 0000-0002-9805-6461 +#=GF SE Wuyts et al, European LSU rRNA database, PMID:11125083 +#=GF SS Published; PMID:11125083 +#=GF GA 42.00 +#=GF TC 42.00 +#=GF NC 41.90 +#=GF TP Gene; rRNA; +#=GF BM cmbuild -F CM SEED +#=GF CB cmcalibrate --mpi CM +#=GF SM cmsearch --cpu 4 --verbose --nohmmonly -T 19.62 -Z 742849.287494 CM SEQDB +#=GF CL CL00112 +#=GF DR URL; http://rrna.uia.ac.be/lsu/index.html; +#=GF DR SO; 0000375; rRNA_5_8S; +#=GF DR GO; 0003735; structural constituent of ribosome; +#=GF DR GO; 0005840; ribosome; +#=GF RN [1] +#=GF RM 11125083 +#=GF RT The European Large Subunit Ribosomal RNA Database. +#=GF RA Wuyts J, De Rijk P, Van de Peer Y, Winkelmans T, De Wachter R +#=GF RL Nucleic Acids Res 2001;29:175-177. +#=GF RN [2] +#=GF RM 9108162 +#=GF RT Role of the 5.8S rRNA in ribosome translocation. +#=GF RA Abou Elela S, Nazar RN +#=GF RL Nucleic Acids Res 1997;25:1788-1794. +#=GF RN [3] +#=GF RM 9154813 +#=GF RT Cytoplasmic p53 polypeptide is associated with ribosomes. +#=GF RA Fontoura BM, Atienza CA, Sorokina EA, Morimoto T, Carroll RB +#=GF RL Mol Cell Biol 1997;17:3146-3154. +#=GF CC 5.8S ribosomal RNA (5.8S rRNA) is a component of the large subunit of the +#=GF CC eukaryotic ribosome. 
It is transcribed by RNA polymerase I as part of the +#=GF CC 45S precursor that also contains 18S and 28S rRNA. Functionally, it is +#=GF CC thought that 5.8S rRNA may be involved in ribosome translocation [2]. It +#=GF CC is also known to form covalent linkage to the p53 tumour suppressor +#=GF CC protein [3]. 5.8S rRNA is also found in archaea. +#=GF WK 5.8S_ribosomal_RNA +#=GF SQ 61 + + +L78065.1/3758-3910 AACCCUAGGCAGGGGAUCACUCGGC-UCAUGGAUCGAUGAAGACCGCAGC-UAAA-UG-CGCGUCAGAAUGU-----GAACUG--CAG-GAC---------ACAU-GAACACCGACA--------CGUUGAACG-AUAUUGC-GCAUUGCAC--------GACUC------AGUGCG--AUGUACACA---UUU---UUGAGUGCCC +AB011808.1/289-442 AACUUUCAACAACGGAUCUCUUGGU-UCUCGCAUCGAUGAAGAACGCAGC-GAAA-UG-CGAUACGUAAUGU-----GAAUUG--CAG-AAUU--------CCGUGAAUCAUCGAAU--------CUUUGAACGCACAUUGC-GCCCCUUGG--------UAUU--------CCAGG--GGGCAUGCC---UGU---UUGAGCGUCA +X60705.1/112-265 AACUUUUAACAACGGAUCUCUUGGC-UCUAGCAUCGAUGAAGAACGCAGC-GAAA-CG-CGAUAUGUAGUGU-----GAAUUG--CAG-AAUU--------CAGUGAAUCAUCGAAU--------CUUUGAACGCACAUGGC-GCCUUCCAG--------UAUC--------CUGGG--AGGCAUGCC---UGU---CCGAGCGUCG +M36008.1/959-1112 GACUCUUAGCGGUGGAUCACUCGGC-UCGUGCGUCGAUGAAGAACGCAGC-UAGC-UG-CGAGAAUUAGUGU-----GAAUUG--CAG-GAC---------ACAUUGAUCAUCGACA--------CUUCGAACGCACCUUGC-GGCCCCGGG--------UUCC-------UCCCGG--GGCUACGCC---UGU---CUGAGGGUCG +#=GC SS_cons :::::::::::::::::::::::::.::::::::::::::((((<<<<__.____.>>.>>,,,,,,,<<<-.....---<<_..___.>>--........-------->>>,,,,,........,)))),,,<<<___>>>.<<<<<<<<<........____........>>>>>..>>>>:::::...:::...:::::::::: +#=GC RF AACUuUuAgCgAUGGAUguCUuGGc.UCucGcaUCGAUGAAgaaCGCaGC.gAAa.uG.CGAUAcgUaauGU.....GAAuuG..CAG.aauu........ccgUGAAUCauCGAau........cuuuGAACGCaaaUuGC.gcccccggg........Uuuu........cccgg..gggcAuguc...UGu...uUgAGuGUCu +// +# STOCKHOLM 1.0 + +#=GF AC RF00003 +#=GF ID U1 +#=GF DE U1 spliceosomal RNA +#=GF AU Griffiths-Jones SR; 0000-0001-6043-807X +#=GF AU Mifsud W; 0000-0002-9805-6461 +#=GF AU Moxon SJ; 0000-0003-4644-1816 +#=GF AU Ontiveros-Palacios N; 0000-0001-8457-4455 +#=GF SE Zwieb C, The uRNA database, PMID:9016512 +#=GF SS Published; PMID:2405391 +#=GF GA 66.00 +#=GF TC 66.00 +#=GF NC 65.90 +#=GF TP Gene; snRNA; splicing; +#=GF BM cmbuild -F CM SEED +#=GF CB cmcalibrate --mpi CM +#=GF SM cmsearch --cpu 4 --verbose --nohmmonly -T 30.00 -Z 742849.287494 CM SEQDB +#=GF CL CL00005 +#=GF DR SO; 0000391; U1_snRNA; +#=GF DR GO; 0000395; mRNA 5'-splice site recognition; +#=GF DR GO; 0030627; pre-mRNA 5'-splice site binding; +#=GF DR GO; 0005685; U1 snRNP; +#=GF RN [1] +#=GF RM 9016512 +#=GF RT The uRNA database. +#=GF RA Zwieb C +#=GF RL Nucleic Acids Res 1997;25:102-103. +#=GF RN [2] +#=GF RM 2405391 +#=GF RT Saccharomyces cerevisiae U1 small nuclear RNA secondary structure contains +#=GF RT both universal and yeast-specific domains. +#=GF RA Kretzner L, Krol A, Rosbash M +#=GF RL Proc Natl Acad Sci U S A 1990;87:851-855. +#=GF RN [3] +#=GF RM 7984237 +#=GF RT Crystal structure at 1.92 A resolution of the RNA-binding domain of the +#=GF RT U1A spliceosomal protein complexed with an RNA hairpin. +#=GF RA Oubridge C, Ito N, Evans PR, Teo CH, Nagai K +#=GF RL Nature 1994;372:432-438. +#=GF RN [4] +#=GF RM 11297556 +#=GF RT Two functionally distinct steps mediate high affinity binding of U1A +#=GF RT protein to U1 hairpin II RNA. +#=GF RA Katsamba PS, Myszka DG, Laird-Offringa IA +#=GF RL J Biol Chem 2001;276:21476-21481. +#=GF RN [5] +#=GF RM 19325628 +#=GF RT Crystal structure of human spliceosomal U1 snRNP at 5.5 A resolution. 
+#=GF RA Pomeranz Krummel DA, Oubridge C, Leung AK, Li J, Nagai K +#=GF RL Nature. 2009;458:475-480. +#=GF RN [6] +#=GF RM 30975767 +#=GF RT Mechanism of 5' splice site transfer for human spliceosome activation. +#=GF RA Charenton C, Wilkinson ME, Nagai K +#=GF RL Science. 2019;364:362-367. +#=GF CC U1 is a small nuclear RNA (snRNA) component of the spliceosome (involved +#=GF CC in pre-mRNA splicing). Its 5' end forms complementary base pairs with the +#=GF CC 5' splice junction, thus defining the 5' donor site of an intron. There +#=GF CC are significant differences in sequence and secondary structure between +#=GF CC metazoan and yeast U1 snRNAs, the latter being much longer (568 +#=GF CC nucleotides as compared to 164 nucleotides in human). Nevertheless, +#=GF CC secondary structure predictions suggest that all U1 snRNAs share a 'common +#=GF CC core' consisting of helices I, II, the proximal region of III, and IV [1]. +#=GF CC This family does not contain the larger yeast sequences. The structure of +#=GF CC U1 spliceosomal RNA has been reported in [5,6]. It present 4 Stem loops +#=GF CC (SL1, SL2, SL3, and SL4) and a region call Helix H. SL1, SL2, and SL3 are +#=GF CC join for the Helix H, forming a four-helix junction that are separated of +#=GF CC SL4. U1 snRNA is important in the precatalytic spliceosome, where the 5' +#=GF CC splice site (5'SS) of the pre-mRNA is recognized by pairing with 5'-U1 +#=GF CC snRNA. Where spliceosome activation is initiated by a disruption of the +#=GF CC 5′SS–U1 snRNP interaction by the DEAD-box helicase Prp28 [6]. The +#=GF CC structure of U1 small nucleolar RNA was reported in PDB:6QX9 +#=GF WK U1_spliceosomal_RNA +#=GF SQ 100 + + +X06810.1/261-421 AUACUUACCUGGACGG-GGUCA-AUGG---AUGAUCAA-UAAG-GUCCA-UGGCCU---AGG-GAAGUAACCUCCAUU-----GCACUU-AGGAGG-GGUGCUUU---------CCUA-AGGUCUGCCCAA---GUGG--CAG-AGCCU-ACGUCAUAAUUUGUGGUAG--UGGGGG--CUUGCGUU--CGCGCAGCCCCUUC +X14417.1/177-340 AUACUUACCUGGACGG-GGUCA-AUGG---GUAAUCAA-GAAG-UUCCA-UGGCCU---AGG-UUGGUGACCUCCAUU-----GCACUA-AGGAGG-GGUGCUUG---------CCUA-AGGUCGACCCAA---GUGG--UUG-AGCCU-ACGUCAUAAUUUGUUGUUGCAGAGGGG-GCCUGUGUU--CGCGCAGCCCCUAC +X06809.1/232-392 AUACUUACCUGGACGG-GGUCA-AUGG---AUGAUCAA-GAAG-GUCCA-UGGCCU---AGG-GAAGUAACCUCCAUU-----GCACUG-AGGAGG-GGUGCCUU---------UCUA-AGGUCUGUCCAA---GUGA--CAG-AGCCU-ACGUCAUAAUUUGUGGUAG--UGGGGG--CCUGCGUU--CGCGCGGCCCCUUU +URS000032B6B6_9606/1-164 AUACUUACCUGGCAGG-GGAGA-UACC---AUGAUCAC-GAAG-GUGGU-UUUCCCA--GGG-CGAGGCUUAUCCAUU-----GCACUC-CGG-AU-GUGCUGAC---------CCCUGCGAUUUCCCCAAA-UGUGG--GAA-ACUCG-ACUGCAUAAUUUGUGGUAG--UGGGGG--ACUGCGUU--CGCGCUUUCCCCUG +#=GR URS000032B6B6_9606/1-164 6QX9_1_SS ...........((((..(((((...((...((............))))..))))).(..(((..(.(((.(((((................)).)).).)))..).........)))).(((((((((.(.....)))..))).).)))..)))).................(((((..(..(((......)))..)))))). 
+#=GC SS_cons :::::::::::((((<.<<<<<.-<<<...<<______.____.>>>>>.>>>>>><..<<<.-<-<<<-<<<<<___.....______._>>->>.>->>>-->.........>>>>,<<<<<<-<<____..__>>..->>->->>>.,))))--------------..-<<<<<.-<-<<<<__..__>>>>->>>>>>: +#=GC RF AUACUUACCUGGccgg.GGgca.accg...gcGAUCAa.GAAG.gccgg.ugcCCca..ggg.ugaggcuccccCAUU.....GCACUu.cGgagg.gugccgac.........cccuGcGguucccCCaAA..GUGg..ugaaaccCg.AcggCAUAAUUUgUGgUAG..ucgGGG.gaccGcgUU..cGcgCguuCCCcgc +#=GC RNA_structural_elements ==========[5pH][================Stem=loop=1============][=========================Stem=loop==========================]=[=======Stem=loop=3==========]==[3pH]================[==========Stem=loop=4========] +// +# STOCKHOLM 1.0 + +#=GF ID U2 +#=GF DE U2 spliceosomal RNA +#=GF AU Griffiths-Jones SR; 0000-0001-6043-807X +#=GF AU Mifsud W; 0000-0002-9805-6461 +#=GF AU Gardner PP; 0000-0002-7808-1213 +#=GF SE The uRNA database, PMID:9016512; Griffiths-Jones SR; PMID:18390578 +#=GF SS Published; PMID:2339054; Griffiths-Jones SR +#=GF GA 46.00 +#=GF TC 46.00 +#=GF NC 45.90 +#=GF TP Gene; snRNA; splicing; +#=GF BM cmbuild -F CM SEED +#=GF CB cmcalibrate --mpi CM +#=GF SM cmsearch --cpu 4 --verbose --nohmmonly -E 1000 -Z 549862.597050 CM SEQDB +#=GF CL CL00006 +#=GF DR SO; 0000392; U2_snRNA; +#=GF DR GO; 0000348; mRNA branch site recognition; +#=GF DR GO; 0045131; pre-mRNA branch point binding; +#=GF DR GO; 0005686; U2 snRNP; +#=GF RN [1] +#=GF RM 9016512 +#=GF RT The uRNA database. +#=GF RA Zwieb C +#=GF RL Nucleic Acids Res 1997;25:102-103. +#=GF RN [2] +#=GF RM 11424937 +#=GF RT A conserved pseudouridine modification in eukaryotic U2 snRNA induces a +#=GF RT change in branch-site architecture. +#=GF RA Newby MI, Greenbaum NL +#=GF RL RNA 2001;7:833-845. +#=GF RN [3] +#=GF RM 11350032 +#=GF RT Crystal structure of a model branchpoint-U2 snRNA duplex containing bulged +#=GF RT adenosines. +#=GF RA Berglund JA, Rosbash M, Schultz SC +#=GF RL RNA 2001;7:682-691. +#=GF RN [4] +#=GF RM 2339054 +#=GF RT The spliceosomal snRNAs of Caenorhabditis elegans. +#=GF RA Thomas J, Lea K, Zucker-Aprison E, Blumenthal T +#=GF RL Nucleic Acids Res 1990;18:2633-2642. +#=GF CC U2 is a small nuclear RNA (snRNA) component of the spliceosome (involved +#=GF CC in pre-mRNA splicing). Complementary binding between U2 snRNA (in an area +#=GF CC lying towards the 5' end but 3' to hairpin I) and the branchpoint sequence +#=GF CC (BPS) of the intron results the bulging out of an unpaired adenosine, on +#=GF CC the BPS, which initiates a nucleophilic attack at the intronic 5' splice +#=GF CC site, thus starting the first of two transesterification reactions that +#=GF CC mediate splicing. 
+#=GF WK U2_spliceosomal_RNA +#=GF SQ 208 + + +AALT01209640.1/567-377 AUCGCU-UCU----CGGCC--UUU-U-GGCUAAGAUCAU--GUGUAGUAUCUGUUCUUAUCAGUUUAAUAUCUGAUA--CGUCCUCU-AUCAGAGGACAA------------UAUAUUAAA------UGGAUUUUUGGAAUUA------GGAG-UUGGAAUA-------GGAGC---U-----U----G-CUCCG-------UCCA-CUCCAC-GCAUCGA---CCUGGUAUUGCAGUAC------UUCCAGG--AA--------CGGUGCACCCCCU +AAFR03033875.1/20528-20718 AUCGCU-UCU----CGGCC--UUU-U-GGCUAAGAUCAA--GUGUAGUAUCUGUUCUUAUCAGUUUAAUAUCUGAUA--CGUCCUCU-AUCCGAGGACAA------------UAUAUUAAA------UGGAUUUUUGAAACAG------GGAG-UCGGAAUA-------GGAGC---U-----U----G-CUCCG-------UCCA-CUCCAC-GCAUCGA---CCUGGUAUUGCAGUAC------UUCCAGG--AA--------CGGUGCACUUCCC +AAIY01044029.1/787-597 AUCGCU-UCU----CGGCC--UUU-U-GGCUAAGAUCAA--GUGUAGUAUCUGUUCUUAUCAGUUUAAGAUCUGAUA--UGUCCUCU-AUCUGAGGACAA------------CAUAUUAAA------CGUAUUUUUGGAAAUA------GGAG-UUGGACCA-------GGAGC---U-----U----G-CUCCA-------UCCA-CUCCAC-GCAUCAG---CCUGGUAUUGCAGUAU------UUCCAGG--AA--------UGGUGCAGCCCCU +AAZO01007389.1/15370-15178 AUCGCU-UCU----CGGCC--UUA-U-GGCUAAGAUCAAA-GUGUAGUAUCUGUUCUUAUCAGCUUAAUAUCUGAUA--CGACCCUC-AUUGAGGGUCCAG-----------AAUAUUAAA------CUGAUUUUUGGAAACG------GAUG-GAGUGUUA-------GGGGC---U-----U----G-CUCCA-------CCUC-CGUCAC-GGGUUGG---CUCGGCAUUGCAGUAC------AGCCGAG--AU--------CGGCCCACCCUUA +AAYZ01695118.1/310-500 AUCGCU-UCU----CGGCC--UUU-U-GGCUAAGAUCAA--GUGUAGUAUCUGUUCUUAUCAGUUUAAUAUCUGAUA--CGUCCUCU-AUCCGAGGACAA------------UAUAUUAAA------UGGAUUUUUGGACGCU------GGAG-UUGGACUA-------GGAGC---U-----U----G-CUCCA-------UCCA-CUCCGC-GCAUCGA---CCUGGUAUUGCAGUAC------UUCCAGG--AC--------CGGUGCACCCCGU +AAHX01044404.1/26102-26292 AUCGCU-UCU----CGGCC--UUU-U-GGCUAAGAUCAA--GUGUAGUAUCUGUUAUUAUCAGUUUAAUAUCUGAUA--UGUCCUCU-AUCUGAGGACAA------------UAUAUUAAA------UGAAUUUUUGGUACUA------GGAG-UUGGAAUA-------GGAGC---U-----U----G-CUCCA-------UCCA-CUCCAC-GCAUCGA---CCUGGUAUUGCAGUAU------UUCCAGG--AA--------UGGUGCACUCCUC +AACN010750078.1/657-848 AUCGCU-UCU----CGGCC--UUU-U-GGCUAAGAUCAA--GUGUAGUAUCUGUUCUUAUCAGUUUAAUAUCUGAUA--CGUCCUCU-AUCCGAGGACAA------------UAUAUUAAA------UGGAUUUUUGGAGCAG------GGAGAUGGAAUAG-------GGAGC---U-----U----G-CUCCG-------UCCA-CUCCGC-GCAUCGA---CCUGGUAUUGCAGUAC------CUCCAGG--AA--------CGGUGCACCCCCU +AAFN02000024.1/475809-475596 AUC----UCU----UUGCC--AUU-U-GGCUUAGAUCCA--GUGUAGUAUCUGUUCUUUUCAGUGUAACAGCUGAAA---UGUCAUC-AUUGAUGACUUUACAUUAUGUUACAAAUUUAUA------CUUAUUUUUGGAUAUUGG----GUAG-AUUGAUGUAUUAAAGUGGGC---U-----U----G-CUCAC----AGUCUUU-CUACAU-AGUGUCG---UUGCCA-CUGUACUUUUAUU--UUGGCUU--CU--------GACGCAAAUUCUU +K00034.1/420-610 AUCGCU-UCU----CGGCC--UUU-U-GGCUAAGAUCAA--GUGUAGUAUCUGUUCUUAUCAGUUUAAUAUCUGAUA--CGUCCUCU-AUCCGAGGACAA------------UAUAUUAAA------UGGAUUUUUGGAACUA------GGAG-UUGGAAUA-------GGAGC---U-----U----G-CUCCG-------UCCA-CUCCAC-GCAUCGA---CCUGGUAUUGCAGUAC------CUCCAGG--AA--------CGGUGCACCCCCU +ABDG02000029.1/618164-617972 UUAGCUCUCU----UUGCC--UUU-U-GGCUUAGAUCAA--GUGUAGUAUCUGUUCUUUUCAGUUUAAUCUCUGAAA---GGUCUCU-AA-GGAGACCAAUC----------GUGAUUAUU------CUUAUUUUUGUCCUCA------GGGC-GGUCUCCUC------UGUGC---U-----U----G-CACAU-----GAUUCC-GCCCAC-AGUGUC----CCUGGUAUUACACUGC------CUCCAGG---C--------GACGCGAACACUU +AP004871.3/124344-124540 AUACCUUUCU----CGGCC--UUU-U-GGCUAAGAUCAA--GUGUAGUAUCUGUUCUUAUCAGUUUAAUAUCUGAUA--UGUGGGCC-AUGUGCCCACUUC-----------GAUAUUAAA------UUUAUUUUUUGUGGGG------GAGG-GCCCACUACA-----GUGGC---U-----U----G-CCACU------GGGGU-CCUCGC-GUGUCGC---CCAGGCGUUGCACUAC------AGCCUGG-GCC--------UGGCGCACCCCAA +#=GC SS_cons 
::::::.<<<....-<<<<..___._.>>>>->>>,,,,..,,,,,,,,,,,,,,,,<<<<<<________>>>>>>..,<<<<<<<.___>>>>>>>,,,...........,,,,,,,,,......,,,,,,,,,,,,,,,,......<<<<.<<<<----.......<<<<<..._....._....>.>>>>-......->>>>.>>>>,,.<<<<<<-...<<<<<<__________......_>>>>>>..--........>>>>>>::::::: +#=GC RF AUacCU.UCu....cgGCc..UUU.U.gGCuaaGAUCAA..GUGUAGUAUCUGUUCUUauCAGUuUAAuAuCUGauA..uggccccc.Auugggggccaau...........uauaUUAaa......uuaAUUUUUggaacua......Gugg.gggcauuu.......uggGC...U.....U....G.Cccau......ugccc.ccaCac.ggguuga...ccuggcaUUGCAcUac......cgccagg..uu........cagcccAcccuuu +// +# STOCKHOLM 1.0 + +#=GF AC RF00005 +#=GF ID tRNA +#=GF DE tRNA +#=GF AU Eddy SR; 0000-0001-6676-4706 +#=GF AU Griffiths-Jones SR; 0000-0001-6043-807X +#=GF AU Mifsud W; 0000-0002-9805-6461 +#=GF SE Eddy SR +#=GF SS Published; PMID:8256282 +#=GF GA 29.00 +#=GF TC 29.00 +#=GF NC 28.90 +#=GF TP Gene; tRNA; +#=GF BM cmbuild -F CM SEED +#=GF CB cmcalibrate --mpi CM +#=GF SM cmsearch --cpu 4 --verbose --nohmmonly -T 22.00 -Z 549862.597050 CM SEQDB +#=GF CL CL00001 +#=GF DR SO; 0000253; tRNA; +#=GF DR GO; 0030533; triplet codon-amino acid adaptor activity; +#=GF RN [1] +#=GF RM 8256282 +#=GF RT The tertiary structure of tRNA and the development of the genetic code. +#=GF RA Hou YM +#=GF RL Trends Biochem Sci 1993;18:362-364. +#=GF RN [2] +#=GF RM 9023104 +#=GF RT tRNAscan-SE: a program for improved detection of transfer RNA genes in +#=GF RT genomic sequence. +#=GF RA Lowe TM, Eddy SR +#=GF RL Nucleic Acids Res 1997;25:955-964. +#=GF CC Transfer RNA (tRNA) molecules are approximately 80 nucleotides in length. +#=GF CC Their secondary structure includes four short double-helical elements and +#=GF CC three loops (D, anti-codon, and T loops). Further hydrogen bonds mediate +#=GF CC the characteristic L-shaped molecular structure. tRNAs have two regions of +#=GF CC fundamental functional importance: the anti-codon, which is responsible +#=GF CC for specific mRNA codon recognition, and the 3' end, to which the tRNAs +#=GF CC corresponding amino acid is attached (by aminoacyl-tRNA synthetases). +#=GF CC tRNAs cope with the degeneracy of the genetic code in two manners: having +#=GF CC more than one tRNA (with a specific anti-codon) for a particular amino +#=GF CC acid; and 'wobble' base-pairing, i.e. permitting non-standard base-pairing +#=GF CC at the 3rd anti-codon position. 
+#=GF WK Transfer_RNA +#=GF SQ 954 + + +AB003409.1/96-167 GGGCCCAU-A-GCUCAGU---GGU---AGAGUG-C-CUCCU-UUGCAAGGAG-GAU------------------------GC--CCUG-GGU-UCG-AA--UCCCA-G-UGGGUCC-A +CP000660.1/704452-704523 GGGCCGGU-A-GUCUAGC---GGA---AGGAUG-C-CCGCC-UCGCGCGCGG-GAG------------------------AU--CCCG-GGU-UCG-AA--UCCCG-G-CCGGUCC-A +X63776.1/648-721 CGGCACGU-A-GCGCAGCC-UGGU---AGCGCA-C-CGUCA-UGGGGUGUCG-GGG------------------------GU--CGGA-GGU-UCA-AA--UCCUC-U-CGUGCCG-A +DQ927305.1/46859-46925 GCUGCUUG-A-AUGGU-----------UUCAGU-G-UGGGC-UCAUUUCCCA-UUA------------------------CU--CAAA-AGU-UCG-AU--UCUUU-U-AAGCGGC-C +K01561.1/1-74 GCGUUCAU-A-GCUCAGUU--GGUU--AGAGCA-C-CACCU-UGACAUGGUG-GGG------------------------GU--CGUU-GGU-UCG-AG--UCCAA-U-UGAACGC-A +X17321.1/66-138 GGGUGAUU-A-GCUCAGCU--GGG---AGAGCA-C-CUCCC-UUACAAGGAG-GGG------------------------GU--CGGC-GGU-UCG-AU--CCCGU-C-AUCACCC-A +AY632242.1/10-80 CAUUAGAU-G-ACUGAA----AG----CAAGUA-C-UGGUC-UCUUAAACCA-UUU------------------------UA--UAGU-AAA-UUA-GC-AUUUAC-U-UCUAAUG-A +J01404.1/5140-5204 AUCUAUAU-A-GUAUAAA---------AGUAUA-U-UUGAC-UUCCAAUCAU-AAG------------------------G---UCUA-UU--AAU-U----AAUA-G-UAUAGAU-A +EU273712.1/5242-5176 AGCCUUAA-A-GUGUUU----------AUCAUG-U-CGAAU-UGCAAAUUCG-AAG------------------------G---UGUA-GAG-AAU-C-C-CUCUA-C-UAAGGCU-U +EU255777.1/1590-1519 UGGGGCGU-G-GCCAAGU---GGU---AAGGCA-A-CGGGU-UUUGGUCCCG-CUA------------------------UU--CGGA-GGU-UCG-AA--UCCUU-C-CGUCCCA-G +X13994.1/40-129 GAAGAUCG-U-CGUCUCC---GGUG--AGGCGG-C-UGGAC-UUCAAAUCCA-GU--UGG-GGCCGCCA--GCGGUCCCG----GGCA-GGU-UCG-AC--UCCUG-U-GAUCUUCC- +M21681.1/156-228 CGCGGGGU-G-GAGCAGCC-UGGU---AGCUCG-U-CGG-C-UCAUAACCCG-AAG------------------------GU--CGUC-GGU-UCA-AA--UCCGG-C-CCCCGCA-A +K00197.1/1-71 GCGGGCGU-A-GUUCAAU---GGU---AGAACG-A-GAGCU-UCCCAAGCUC-UAU------------------------A---CGAG-GGU-UCG-AU--UCCCU-U-CGCCCGC-U +X51770.1/245-317 GGCCGCGU-G-GCGCAAU---GGAU--AACGCG-U-CUGCC-UACGGAGCAG-AAG------------------------AU--UGCA-GGU-UCG-AA--UCCUG-C-CGUGGUC-G +K01856.1/1-82 GGAGAGAU-G-GCCGAGC---GGUCU-AAGGCG-C-UGGUU-UAAGGCACCA-GU--CCC-----UUC---G-----GGGG---CGUG-GGU-UCG-AA--UCCCA-C-UCUCUUC-A +Z83129.1/22044-21973 UCCUCGGU-A-GUAUAGU---GGUG--AGUAUC-C-GCGUC-UGUCACAUGC-GAG------------------------A---CCCG-GGU-UCA-AU--UCCCG-G-CCGGGGA-G +M25476.1/51-122 AGCAGCGU-G-GCGCAGU---GGA---AGCGUG-C-UGGGC-CCAUAACCCA-GAG------------------------GU--CGGU-GGA-UCG-AA--ACCAC-U-CGCUGCU-A +X54124.1/910-981 GACUGCUU-G-GCGCAAU---GGU---AGCGCG-U-UCGAC-UCCAGAUCGA-AAG------------------------GU--UGGG-CGU-UCG-AU--CCGCU-C-AGUGGUC-A +K01390.1/442-514 GAGCCAUU-A-GCUCAGUU--GGU---AGAGCA-U-CUGAC-UUUUAAUCAG-AGG------------------------GU--CGAA-GGU-UCG-AG--UCCUU-C-AUGGCUC-A +#=GC SS_cons (((((((,.,.<<<<___...___..._>>>>,.<.<<<<_.______>>>>.>,,........................,...,<<<.<<_.___.__.._>>>>.>.))))))).: +#=GC RF GgagauaU.A.GCucAgU...GGU...AgaGCg.u.cgGaC.UuaaAAuCcg.aag........................g...cgcg.GGU.UCg.Aa..UCCcg.c.uaucucC.a +// +# STOCKHOLM 1.0 + +#=GF AC RF00006 +#=GF ID Vault +#=GF DE Vault RNA +#=GF AU Bateman A; 0000-0002-6982-4660 +#=GF AU Gardner PP; 0000-0002-7808-1213 +#=GF SE Published; PMID:19491402 +#=GF SS Published; PMID:19491402 +#=GF GA 34.00 +#=GF TC 34.10 +#=GF NC 33.90 +#=GF TP Gene; +#=GF BM cmbuild -F CM SEED +#=GF CB cmcalibrate --mpi CM +#=GF SM cmsearch --cpu 4 --verbose --nohmmonly -E 1000 -Z 549862.597050 CM SEQDB +#=GF DR URL; http://vaults.arc.ucla.edu/sci/sci_home.htm; +#=GF DR SO; 0000404; vault_RNA; +#=GF RN [1] +#=GF RM 19491402 +#=GF RT Evolution of Vault RNAs. 
+#=GF RA Stadler PF, Chen JJ, Hackermueller J, Hoffmann S, Horn F, Khaitovich P, +#=GF RA Kretzschmar AK, Mosig A, Prohaska SJ, Qi X, Schutt K, Ullmann K +#=GF RL Mol Biol Evol. 2009;27:1-17. +#=GF CC This family of RNAs are found as part of the enigmatic vault +#=GF CC ribonucleoprotein complex. The complex consists of a major vault protein +#=GF CC (MVP), two minor vault proteins (VPARP and TEP1), and several small +#=GF CC untranslated RNA molecules. It has been suggested that the vault complex +#=GF CC is involved in drug resistance. We have identified a putative novel vault +#=GF CC RNA on chromosome 5 EMBL:AC005219. +#=GF WK Vault_RNA +#=GF SQ 73 + + +AF045145.1/1-88 -GGCUGGC-UUUAGCUC-AGCGGUUACUUCGCGUGUCAUCAAACCACCUCUCU-------------------------------------------------------GGGUUGUUCGAGAC-----------------CCGCGGGCGCUCUCCAGCCCUCUU +AADA01028285.1/4499-4587 GGGCUGGC-UUUAGCUC-AGCGGUUACUUCGCGUGUCAUCAAACCACCUCUCU-------------------------------------------------------GGGUGGUUCGAGAC-----------------CCGUGGGCGCUCUCCAUCUCUCUU +AANU01185798.1/9761-9673 GGGCUGGC-UUUAGCUC-AGCGGUUACUUCGCAGUUCAGCAAACCACCUCUCU-------------------------------------------------------GGGUUGUUCGAGAC-----------------CCGCGGGCACUCUCCAGCCCUUUU +AAGJ04060733.1/252-131 UGGCUGGC--UAAGCAG-UGUGGAUACUUCGUAUAGCUAAAUGGAAUAGCACUUACUAAUCACAUGAG------------------------------------UAUUUGUGGGUUCGAUCCCCACUAC--GUCUCUUUCUACAUGUGCGUUCCAGCUUUUUU +#=GC SS_cons :<<<<<<-.----<<<<.-<<<<----<<<<<________________________________...........................................________>>>>>--.................>>>>>>>>----->>>>>>::::: +#=GC RF GgGccGGC.UUUAGCuc.AGcGGUUACuUCgacuauuuuaauuuuauuuaucuuauuuuuuuuu...........................................uuguuGGUucGAgaC.................CCgCggGCGCUcUCCggCccUUUU +// diff --git a/src/python/test/xrefs/parsers/flatfiles/rgd.txt b/src/python/test/xrefs/parsers/flatfiles/rgd.txt new file mode 100644 index 000000000..f0da555bf --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/rgd.txt @@ -0,0 +1,98 @@ +# RGD-PIPELINE: ftp-file-extracts +# MODULE: genes build 2024-03-11 +# GENERATED-ON: 2024/05/17 +# PURPOSE: information about active Rat genes extracted from RGD database +# SPECIES: Rattus norvegicus (Norway rat) NCBI:txid10116 +# CONTACT: rgd.data@mcw.edu +# FORMAT: tab delimited text +# NOTES: multiple values in a single column are separated by ';' +# +### Apr 1, 2011 RATMAP_IDs and RHDB_IDs are discontinued. +### Apr 15, 2011 GENE_REFSEQ_STATUS column is provided. +### Jul 1, 2011 fixed generation of CURATED_REF_PUBMED_IDs and UNCURATED_PUBMED_IDs +### Nov 23, 2011 no format changes (UniGene Ids are extracted from db in different way) +### Dec 19, 2011 fixed documentation in header to be consistent with column names +### Jul 6, 2012 added generation of file GENES_RAT_5.0 +### Oct 23, 2012 obsoleted column 23 'UNCURATED_REF_MEDLINE_ID' - changed to '(UNUSED)' +### Aug 19, 2013 gene descriptions made consistent with gene report pages from RGD website +### Oct 2, 2014 genes files refactored: +### GENES_RAT_5.0.txt and GENES_RAT_6.0.txt retired -- added new columns to GENES_RAT.txt to accommodate positions for Rnor_5.0 and Rnor_6.0. 
+### May 25, 2017 GENE_REFSEQ_STATUS is now published in column 23 for all species +### during transition period, for rat, mouse and human, GENE_REFSEQ_STATUS will continue to be also published in columns 39, 41 and 42 respectively +### Nov 1, 2018 renamed columns: SSLP_RGD_ID => MARKER_RGD_ID, SSLP_SYMBOL => MARKER_SYMBOL +### Jun 17 2019 data sorted by RGD ID; files exported into species specific directories +### Mar 11 2020 added Ensembl map positions +### Jan 18 2021 discontinued column 27 UNIGENE ID +### Feb 12 2021 added export of positions on assembly mRatBN7.2; discontinued export of positions on assembly RGSCv3.1 (columns 6,12,13,14) +### Jan 25 2022 rat Ensembl positions exported for mRatBN7.2 assembly +### Apr 18 2022 added export of canonical proteins in column 27 +### Mar 10 2023 no more 'protein_coding' gene types: 'protein-coding' used instead +### Mar 11 2024 added export of positions on assembly GRCr8 +# +#COLUMN INFORMATION: +# (First 38 columns are in common between all species) +# +#1 GENE_RGD_ID the RGD_ID of the gene +#2 SYMBOL official gene symbol +#3 NAME gene name +#4 GENE_DESC gene description (if available) +#5 CHROMOSOME_CELERA chromosome for Celera assembly +#6 CHROMOSOME_mRatBN7.2 chromosome for reference assembly mRatBN7.2 +#7 CHROMOSOME_RGSC_v3.4 chromosome for reference assembly RGSC_v3.4 +#8 FISH_BAND fish band information +#9 START_POS_CELERA start position for Celera assembly +#10 STOP_POS_CELERA stop position for Celera assembly +#11 STRAND_CELERA strand information for Celera assembly +#12 START_POS_mRatBN7.2 start position for reference assembly mRatBN7.2 +#13 STOP_POS_mRatBN7.2 stop position for reference assembly mRatBN7.2 +#14 STRAND_mRatBN7.2 strand information for reference assembly mRatBN7.2 +#15 START_POS_RGSC_v3.4 start position for reference assembly RGSC_v3.4 +#16 STOP_POS_RGSC_v3.4 stop position for reference assembly RGSC_v3.4 +#17 STRAND_RGSC_v3.4 strand information for reference assembly RGSC_v3.4 +#18 CURATED_REF_RGD_ID RGD_ID of paper(s) used to curate gene +#19 CURATED_REF_PUBMED_ID PUBMED_ID of paper(s) used to curate gene +#20 UNCURATED_PUBMED_ID PUBMED ids of papers associated with the gene at NCBI but not used for curation +#21 NCBI_GENE_ID NCBI Gene ID +#22 UNIPROT_ID UniProtKB id(s) +#23 GENE_REFSEQ_STATUS gene RefSeq Status (from NCBI) +#24 GENBANK_NUCLEOTIDE GenBank Nucleotide ID(s) +#25 TIGR_ID TIGR ID(s) +#26 GENBANK_PROTEIN GenBank Protein ID(s) +#27 CANONICAL_PROTEIN UniProt canonical protein(s) +#28 MARKER_RGD_ID RGD_ID(s) of markers associated with given gene +#29 MARKER_SYMBOL marker symbol +#30 OLD_SYMBOL old symbol alias(es) +#31 OLD_NAME old name alias(es) +#32 QTL_RGD_ID RGD_ID(s) of QTLs associated with given gene +#33 QTL_SYMBOL QTL symbol +#34 NOMENCLATURE_STATUS nomenclature status +#35 SPLICE_RGD_ID RGD_IDs for gene splices +#36 SPLICE_SYMBOL symbol for gene +#37 GENE_TYPE gene type +#38 ENSEMBL_ID Ensembl Gene ID +#39 (UNUSED) blank +#40 CHROMOSOME_Rnor_5.0 chromosome for Rnor_5.0 reference assembly +#41 START_POS_Rnor_5.0 start position for Rnor_5.0 reference assembly +#42 STOP_POS_Rnor_5.0 stop position for Rnor_5.0 reference assembly +#43 STRAND_Rnor_5.0 strand information for Rnor_5.0 reference assembly +#44 CHROMOSOME_Rnor_6.0 chromosome for Rnor_6.0 reference assembly +#45 START_POS_Rnor_6.0 start position for Rnor_6.0 reference assembly +#46 STOP_POS_Rnor_6.0 stop position for Rnor_6.0 reference assembly +#47 STRAND_Rnor_6.0 strand information for Rnor_6.0 reference assembly +#48 CHROMOSOME_ENSEMBL 
chromosome for mRatBN7.2 Ensembl assembly +#49 START_POS_ENSEMBL start position for mRatBN7.2 Ensembl assembly +#50 STOP_POS_ENSEMBL stop position for mRatBN7.2 Ensembl assembly +#51 STRAND_ENSEMBL strand information for mRatBN7.2 Ensembl assembly +#52 CHROMOSOME_GRCr8 chromosome for GRCr8 NCBI assembly +#53 START_POS_GRCr8 start position for GRCr8 NCBI assembly +#54 STOP_POS_GRCr8 stop position for GRCr8 NCBI assembly +#55 STRAND_GRCr8 strand information for GRCr8 NCBI assembly +# +GENE_RGD_ID SYMBOL NAME GENE_DESC CHROMOSOME_CELERA CHROMOSOME_mRatBN7.2 CHROMOSOME_RGSC_v3.4 FISH_BAND START_POS_CELERA STOP_POS_CELERA STRAND_CELERA START_POS_mRatBN7.2 STOP_POS_mRatBN7.2 STRAND_mRatBN7.2 START_POS_RGSC_v3.4 STOP_POS_RGSC_v3.4 STRAND_RGSC_v3.4 CURATED_REF_RGD_ID CURATED_REF_PUBMED_ID UNCURATED_PUBMED_ID NCBI_GENE_ID UNIPROT_ID GENE_REFSEQ_STATUS GENBANK_NUCLEOTIDE TIGR_ID GENBANK_PROTEIN CANONICAL_PROTEIN MARKER_RGD_ID MARKER_SYMBOL OLD_SYMBOL OLD_NAME QTL_RGD_ID QTL_SYMBOL NOMENCLATURE_STATUS SPLICE_RGD_ID SPLICE_SYMBOL GENE_TYPE ENSEMBL_ID (UNUSED) CHROMOSOME_Rnor_5.0 START_POS_Rnor_5.0 STOP_POS_Rnor_5.0 STRAND_Rnor_5.0 CHROMOSOME_Rnor_6.0 START_POS_Rnor_6.0 STOP_POS_Rnor_6.0 STRAND_Rnor_6.0 CHROMOSOME_ENSEMBL START_POS_ENSEMBL STOP_POS_ENSEMBL STRAND_ENSEMBL CHROMOSOME_GRCr8 START_POS_GRCr8 STOP_POS_GRCr8 STRAND_GRCr8 +2003 Asip agouti signaling protein 3 3 3 q41 142203571 142291520 + 143473584 143561170 + 145445175 145536831 + 68690;70068;1625724;1598407;1580655;1600115;1580654;2313999;2314006;1357925;6480464;6484113;8554872;13792537 10426381;11353396;14633851;15189116;21873635;7987393 12177191;12601169;17247639;17873059;19534427;21949658;29219041;7665913;9454589;9548375 24152 A0A8I6A2G1;F1LQS7;Q99JA2 VALIDATED AB045587;NM_052979 TC209185 BAB21564;BAB21579;EDL85941;EDL85942;EDL85943;NP_443211;Q99JA2 Q99JA2 5081260;5501161 PMC151376P1;RH142058 A;ASP agouti;agouti (coat color);agouti switch protein;agouti-signaling protein 70199 Coreg1 APPROVED protein-coding ENSRNOG00000017701 3 156860395 156949277 + 3 150492010 150579870 + 3 143555696 143561171 + 3 163933768 164021377 + +2004 A2m alpha-2-macroglobulin 4 4 4 q42 143730195 143781190 + 154897770 154947787 + 158103711 158153423 + 70068;70249;67925;619610;704363;704364;1298539;1298570;1549857;1549856;1300048;1598506;1598509;1598510;1302534;1300321;1598710;1598511;1598512;1598513;1331525;1300322;1358261;1358260;1580654;1580655;1600115;2298922;1598407;2298948;6480464;6484113;6907045;7240710;7411612;7401223;8554872;10046031;10046042;10046045;10046010;10046012;10046014;10046021;10046023;10046029;10046030;10046033;10046034;10046036;10046046;10046016;10046018;10046028;10046044;10046015;10046032;10046041;13702087;6892692;13792537;6892693;38500238;1578409 10319853;10848441;10936700;11498265;11779202;11813239;11839752;11952820;12042906;12125811;12133586;12221929;12494268;12809600;12966032;14675603;14960360;15118671;15167684;15509519;16177542;16538883;1710603;17722867;18177927;19240864;20005173;20579363;21478484;21742475;21873635;22434847;2424486;2432068;2436819;2442306;2448189;2450021;2460123;2468362;2475424;2479532;2581948;28266892;32747830;6163339;6202298;9446838;9453001;94834;9697696;9843780 10880251;11435418;12223092;12477932;12538697;15226301;15272003;15489334;17071617;1725450;17487688;17565389;18485748;18701465;19796622;20458337;20848291;21188621;21362503;21642630;21669904;22516433;23376485;23533145;2414291;2466233;2473946;26746007;26895739;27301375;29476059;36894970;9398211;9714181 24153 A0A8L2QY59;A6ILD0;A6ILD1;A6ILD2;P06238;Q4FZY3 PROVISIONAL 
AH002120;AH002202;AH003208 TC229016;TC239648 AAA40636;AAA40637;AAA40638;AAA41595;AAA77658;AAH98922;AAW65786;AAX11376;AAX12488;CAA32164;EDM02007;EDM02008;EDM02009;NP_036620;P06238 P06238 10048;10049;42147 D4Arb15;D4Mit20;D4Wox16 A2MAC1;A2m1 alpha-2-M;alpha-2-macroglobulin-P;alpha-2-macroglobulin-like 6903353;724558 Bp353;Plsm2 PROVISIONAL protein-coding ENSRNOG00000028896;ENSRNOG00000045772 4 221393233 221442945 + 4 154309426 154359138 + 4 154897877 154947786 + 4 156570163 156619870 + + Aanat aralkylamine N-acetyltransferase 10 10 10 q32.2 100399613 100403925 + 101827072 101831805 + 106709371 106713683 + 70068;70285;67926;619610;632679;628397;1298610;1298540;1298611;1298603;704409;1300232;1580655;1600115;1300048;2302130;2301030;2301033;2301039;2301034;2301036;2301032;2301043;2312676;2301038;2301031;2301041;2301035;2301037;6480464;6907045;7240710;10402751;8553854;13792537 10451024;11125071;11427721;11516836;11854096;12358739;12736803;16024134;16166080;16282194;16441550;16805813;17014691;17164235;17198543;18001324;18048060;18321474;18624957;21873635;6268470;7502081;7592994;8524412;8770929;9054387 11313340;14617573;15046865;15193530;15228600;15798208;16099857;16687309;17363136;17364576;17403780;20210853;21437622;22908386;23080076;23513468;24877634;25594545;27339900;28502584;30890428;31124080;37256589;7545952 25120 A6HKX7;Q4JL74;Q64553;Q64666 VALIDATED AC123144;CH473948;DQ075321;JAXUCZ010000010;NM_052979 TC222688 AAA92711;AAB38484;AAC52330;AAY86767;EDM06682;NP_036950;Q64666;XP_006247854 Q64666 1626975;1630499;5028123 Aanat;D10Wox52;D11Mit102 AA-NAT;Nat4 Arylalkylamine N - acetyltransferase (Serotonin N - acetyltransferase);arylakylamine N-acetyltransferase;arylalkylamine N - acetyltransferase ;arylalkylamine N-acetyltransferase;seretonin N-acetyltransferase;serotonin N-acetyltransferase;serotonin acetylase APPROVED protein-coding 10 105231006 105235322 + 10 105568091 105572407 + 10 101827301 101831801 + 10 102323647 102330639 + +2007 Abcd3 ATP binding cassette subfamily D member 3 2 2 2 q42 202282471 202317962 - 209852087 209905763 - 218396071 218432172 - 70068;619610;631711;704362;1358265;1580655;1300330;1598654;1598656;1598657;1598658;704409;1600115;1580654;6480464;7240710;8554872;1580664;8553510;8554507;13792537 10366717;11125071;11341945;11883951;12176987;1301993;14561759;15060019;19010322;1968461;21873635;7528830;9108325 10527525;10704444;11248239;11453642;12865426;12915479;14651853;16344115;17542813;17609205;18178290;18614015;18992293;19479899;19686593;19946888;20007743;21460186;21502359;21525035;22871113;25168382;31505169;9425230;9765053;9922452 25270 A0A8I5ZN14;A0A8I6A495;A0A8I6ANP9;A6HVF3;A6HVF4;A6HVF5;P16970 VALIDATED XM_039101772;XM_039101774;XM_063281326 TC229511 BAA14086;EDL82089;EDL82090;EDL82091;NP_036936;P16970;XP_038957700;XP_038957702;XP_063137396 P16970 5025528;5051803;67314 D2Arb23;RH128671;RH94667 PMP70;PMP70, 70-kDa peroxisomal membrane protein 70 kDa peroxisomal membrane protein;70-kDa peroxisomal membrane protein;ATP-binding cassette sub-family D member 3;ATP-binding cassette, sub-family D (ALD), member 3;ATP-binding cassette, subfamily D (ALD), member 3;Peroxisomal membrane protein 1 APPROVED protein-coding 2 243374189 243409604 - 2 225335708 225389120 - 2 209852087 209906020 - 2 212536791 212590379 - +2011 ENSRNOG00000012966 acyl-CoA dehydrogenase, long chain 9 9 9 q32 65813263 65851320 - 68333981 68372149 - 65613130 65651775 - 
70068;619610;631718;631739;704362;737633;1600115;704409;1300048;1580654;1580655;2317589;2317678;6480464;6907045;8554872;10402751;8553446;13673745;13792537 11125071;12477932;14728676;15060019;21873635;2777793;3813556;3968063;8660691;9802886 14651853;15489334;15639194;18614015;21151927;23106098;26316108;26767982;8268228;9861014 25287 A0A8I6GMH0;A6KFD6;A6KFD7;P15650 PROVISIONAL BC062006;CH474044;FQ215575;FQ218275;J05029;JAXUCZ010000009;L11276;NM_012819;XM_063266668 TC203790 AAA40668;AAA41514;AAH62006;EDL75290;EDL75291;NP_036951;P15650;XP_063122738 P15650 5029849;5052821;5506717 ACADL;AW530440;RH142293 LCAD ACOADA;Acyl Coenzyme A dehydrogenase long chain;Acyl Coenzyme A dehydrogenase, long chain;LCAD long chain acyl-CoA dehydrogenase;LCAD, long chain acyl-CoA dehydrogenase;acetyl-Coenzyme A dehydrogenase, long-chain;acyl-Coenzyme A dehydrogenase, long-chain;long-chain acyl-CoA dehydrogenase;long-chain specific acyl-CoA dehydrogenase, mitochondrial APPROVED protein-coding ENSRNOG00000012966 9 73434371 73472895 + 9 73833368 73871857 - 9 68333980 68372220 - 9 75783689 75822077 - +2012 Acadm acyl-CoA dehydrogenase medium chain 2 2 2 q45 234791302 234815446 - 242858865 242883036 - 251866645 251890729 - 70068;619610;70860;631718;631724;631739;704362;1358266;704409;1600115;1598685;1598687;1598688;1598689;1598690;1598691;1300334;1300048;1580655;1580654;2317589;2317678;6480464;6484113;6907045;7240710;8554872;10402751;10047124;8553446;13792537 10958805;11125071;11306811;14728676;15060019;15358373;15850406;15852996;15863369;21873635;23076603;2777793;3611054;3813556;3968063;734877;8615829;8660691;9164869 14651853;16020546;16121256;16972171;18061544;18459129;18614015;1902818;19224950;19428797;19703432;1970566;2029527;21084676;21237683;21630459;23376485;2393404;25416781;26316108;26767982;32227582;3597357 24158 A0A8I5Y8D9;A0A8I5ZQ05;A6HWP6;G3V796;P08503 VALIDATED BP502473;CH473952;CK359511;FQ214755;J02791;JAXUCZ010000002;NM_016986 TC216640 AAA40670;EDL82532;NP_058682;P08503 P08503 5028777;5035562;5048878;5075084 ACADM;RH133158;RH138389;RH142291 Acyl-Coenzyme A dehydrogenase C-4 to C-12 straight-chain;Acyl-Coenzyme A dehydrogenase, C-4 to C-12 straight-chain;acetyl-Coenzyme A dehydrogenase, medium chain;acyl-CoA dehydrogenase, C-4 to C-12 straight chain;acyl-Coenzyme A dehydrogenase, C-4 to C-12 straight chain;acyl-Coenzyme A dehydrogenase, medium chain;medium-chain acyl-CoA dehydrogenase;medium-chain specific acyl-CoA dehydrogenase, mitochondrial APPROVED protein-coding ENSRNOG00000009845;ENSRNOG00055028387 2 278788485 278812656 - 2 260124418 260148589 - 2 242858865 242883147 - 2 245518693 245542864 - +2013 Acadsb acyl-CoA dehydrogenase, short/branched chain 1 1 1 q41 183943792 183982502 + 186188939 186227796 + 190987657 191026275 + 619610;631739;1298221;1358267;704409;1600115;1300336;1300048;1580654;1580655;6480464;6907045;7240710;8554872;10402751;13792537 11125071;12855692;21873635;631739;734879;8660691 10832746;11013134;14651853;18614015;23376485;23474214 25618 A0A0A0MY00;A0A8I6G5Q8;A0A8I6GLN2;A6HWW9;A6HWX0;P70584 PROVISIONAL AAB17136;EDM11700;EDM11701;NP_037216;P70584;XP_008758088;XP_063138171 P70584 5057173 D1Bda38 2-MEBCAD;LOC103691247;SBCAD 2-methyl branched chain acyl-CoA dehydrogenase;2-methylbutyryl-CoA dehydrogenase;2-methylbutyryl-coenzyme A dehydrogenase;Acyl-Coenzyme A dehydrogenase short-branched chain;Acyl-Coenzyme A dehydrogenase, short-branched chain;acyl-Coenzyme A dehydrogenase, short/branched chain;short/branched chain specific acyl-CoA dehydrogenase, mitochondrial;uncharacterized 
LOC103691247 PROVISIONAL protein-coding 1 209013684 209048775 + 1 201981362 202022771 + 1 186188987 186230379 + 1 195619088 195660564 + diff --git a/src/python/test/xrefs/parsers/flatfiles/ucsc.txt b/src/python/test/xrefs/parsers/flatfiles/ucsc.txt new file mode 100644 index 000000000..d00e5e2c5 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/ucsc.txt @@ -0,0 +1,10 @@ +ENST00000619216.1 chr1 - 17368 17436 17368 17368 1 17368, 17436, uc031tla.1 +ENST00000473358.1 chr1 + 29553 31097 29553 29553 3 29553,30563,30975, 30039,30667,31097, uc057aty.1 +ENST00000469289.1 chr1 + 30266 31109 30266 30266 2 30266,30975, 30667,31109, uc057atz.1 +ENST00000607096.1 chr1 + 30365 30503 30365 30365 1 30365, 30503, uc031tlb.1 +ENST00000417324.1 chr1 - 34553 36081 34553 34553 3 34553,35276,35720, 35174,35481,36081, uc001aak.4 +ENST00000461467.1 chr1 - 35244 36073 35244 35244 2 35244,35720, 35481,36073, uc057aua.1 +ENST00000641515.2 chr1 + 65418 71585 65564 70008 3 65418,65519,69036, 65433,65573,71585, A0A2U3U0J3 uc001aal.2 +ENST00000335137.4 chr1 + 69054 70108 69090 70008 1 69054, 70108, Q8NH21 uc285fxb.1 +ENST00000466430.5 chr1 - 89294 120932 89294 89294 4 89294,92090,112699,120774, 91629,92240,112804,120932, uc057aub.1 +ENST00000495576.1 chr1 - 89550 91105 89550 89550 2 89550,90286, 90050,91105, uc057auc.1 diff --git a/src/python/test/xrefs/parsers/flatfiles/uniprot_release.txt b/src/python/test/xrefs/parsers/flatfiles/uniprot_release.txt new file mode 100644 index 000000000..3b34a92b6 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/uniprot_release.txt @@ -0,0 +1,3 @@ +UniProt Knowledgebase Release 2024_03 consists of: +UniProtKB/Swiss-Prot Release 2024_03 of 29-May-2024 +UniProtKB/TrEMBL Release 2024_03 of 29-May-2024 diff --git a/src/python/test/xrefs/parsers/flatfiles/uniprot_swissprot.txt b/src/python/test/xrefs/parsers/flatfiles/uniprot_swissprot.txt new file mode 100644 index 000000000..fda4ed35d --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/uniprot_swissprot.txt @@ -0,0 +1,591 @@ +ID 1433B_HUMAN Reviewed; 246 AA. +AC P31946; A8K9K2; E1P616; +DT 01-JUL-1993, integrated into UniProtKB/Swiss-Prot. +DT 23-JAN-2007, sequence version 3. +DT 29-MAY-2024, entry version 248. +DE RecName: Full=14-3-3 protein beta/alpha; +DE AltName: Full=Protein 1054; +DE AltName: Full=Protein kinase C inhibitor protein 1; +DE Short=KCIP-1; +DE Contains: +DE RecName: Full=14-3-3 protein beta/alpha, N-terminally processed; +GN Name=YWHAB; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606; +DR EMBL; X57346; CAA40621.1; -; mRNA. +DR EMBL; AK292717; BAF85406.1; -; mRNA. +DR EMBL; AL008725; -; NOT_ANNOTATED_CDS; Genomic_DNA. +DR EMBL; CH471077; EAW75893.1; -; Genomic_DNA. +DR EMBL; CH471077; EAW75894.1; -; Genomic_DNA. +DR EMBL; CH471077; EAW75896.1; -; Genomic_DNA. +DR EMBL; BC001359; AAH01359.1; -; mRNA. +DR PIR; S34755; S34755. +DR RefSeq; NP_003395.1; NM_003404.4. [P31946-1] +DR RefSeq; NP_647539.1; NM_139323.3. [P31946-1] +DR RefSeq; XP_016883528.1; XM_017028039.1. +DR PDB; 2BQ0; X-ray; 2.50 A; A/B=2-239. +DR PDB; 2C23; X-ray; 2.65 A; A=2-239. +DR PDB; 4DNK; X-ray; 2.20 A; A/B=1-246. +DR PDB; 5N10; X-ray; 1.60 A; A/B=1-246. +DR PDB; 6A5Q; X-ray; 2.00 A; A/B/C=1-246. +DR PDB; 6BYK; X-ray; 3.00 A; A/B/C/D=3-232. +DR PDB; 6GN0; X-ray; 3.24 A; A/B/C/D=1-239. +DR PDB; 6GN8; X-ray; 2.34 A; A/B=1-234. 
+DR PDB; 6GNJ; X-ray; 3.24 A; A/B=1-234. +DR PDB; 6GNK; X-ray; 2.55 A; A/B=1-234. +DR PDB; 6GNN; X-ray; 3.79 A; A=1-239. +DR PDB; 6HEP; X-ray; 1.86 A; A/B/C/D=1-232. +DR PDB; 8DP5; EM; 3.10 A; C=1-246. +DR PDB; 8EQ8; X-ray; 1.50 A; A/B=1-239. +DR PDB; 8EQH; X-ray; 1.90 A; A/B=1-239. +DR PDBsum; 2BQ0; -. +DR PDBsum; 2C23; -. +DR PDBsum; 4DNK; -. +DR PDBsum; 5N10; -. +DR PDBsum; 6A5Q; -. +DR PDBsum; 6BYK; -. +DR PDBsum; 6GN0; -. +DR PDBsum; 6GN8; -. +DR PDBsum; 6GNJ; -. +DR PDBsum; 6GNK; -. +DR PDBsum; 6GNN; -. +DR PDBsum; 6HEP; -. +DR PDBsum; 8DP5; -. +DR PDBsum; 8EQ8; -. +DR PDBsum; 8EQH; -. +DR AlphaFoldDB; P31946; -. +DR EMDB; EMD-27630; -. +DR SASBDB; P31946; -. +DR SMR; P31946; -. +DR BioGRID; 113361; 1082. +DR CORUM; P31946; -. +DR DIP; DIP-743N; -. +DR ELM; P31946; -. +DR IntAct; P31946; 656. +DR MINT; P31946; -. +DR STRING; 9606.ENSP00000361930; -. +DR BindingDB; P31946; -. +DR ChEMBL; CHEMBL3710403; -. +DR DrugBank; DB09130; Copper. +DR DrugBank; DB12695; Phenethyl Isothiocyanate. +DR GlyGen; P31946; 1 site, 1 O-linked glycan (1 site). +DR iPTMnet; P31946; -. +DR MetOSite; P31946; -. +DR PhosphoSitePlus; P31946; -. +DR SwissPalm; P31946; -. +DR BioMuta; YWHAB; -. +DR DMDM; 1345590; -. +DR OGP; P31946; -. +DR REPRODUCTION-2DPAGE; IPI00216318; -. +DR CPTAC; CPTAC-142; -. +DR jPOST; P31946; -. +DR MassIVE; P31946; -. +DR MaxQB; P31946; -. +DR PaxDb; 9606-ENSP00000361930; -. +DR PeptideAtlas; P31946; -. +DR PRIDE; P31946; -. +DR ProteomicsDB; 54816; -. +DR ProteomicsDB; 54817; -. [P31946-2] +DR Pumba; P31946; -. +DR TopDownProteomics; P31946-1; -. [P31946-1] +DR TopDownProteomics; P31946-2; -. [P31946-2] +DR Antibodypedia; 1906; 847 antibodies from 46 providers. +DR CPTC; P31946; 3 antibodies. +DR DNASU; 7529; -. +DR Ensembl; ENST00000353703.9; ENSP00000300161.4; ENSG00000166913.13. [P31946-1] +DR Ensembl; ENST00000372839.7; ENSP00000361930.3; ENSG00000166913.13. [P31946-1] +DR GeneID; 7529; -. +DR KEGG; hsa:7529; -. +DR MANE-Select; ENST00000353703.9; ENSP00000300161.4; NM_139323.4; NP_647539.1. +DR AGR; HGNC:12849; -. +DR CTD; 7529; -. +DR DisGeNET; 7529; -. +DR GeneCards; YWHAB; -. +DR HPA; ENSG00000166913; Low tissue specificity. +DR neXtProt; NX_P31946; -. +DR OpenTargets; ENSG00000166913; -. +DR PharmGKB; PA37438; -. +DR VEuPathDB; HostDB:ENSG00000166913; -. +DR eggNOG; KOG0841; Eukaryota. +DR GeneTree; ENSGT01090000260040; -. +DR HOGENOM; CLU_058290_1_0_1; -. +DR InParanoid; P31946; -. +DR OMA; EQHVTII; -. +DR OrthoDB; 920089at2759; -. +DR PhylomeDB; P31946; -. +DR TreeFam; TF102003; -. +DR PathwayCommons; P31946; -. +DR SignaLink; P31946; -. +DR SIGNOR; P31946; -. +DR BioGRID-ORCS; 7529; 19 hits in 1156 CRISPR screens. +DR ChiTaRS; YWHAB; human. +DR EvolutionaryTrace; P31946; -. +DR GeneWiki; YWHAB; -. +DR Pharos; P31946; Tbio. +DR PRO; PR:P31946; -. +DR Proteomes; UP000005640; Chromosome 20. +DR RNAct; P31946; Protein. +DR Bgee; ENSG00000166913; Expressed in endothelial cell and 214 other cell types or tissues. +DR ExpressionAtlas; P31946; baseline and differential. +DR CDD; cd10022; 14-3-3_beta_zeta; 1. +DR Gene3D; 1.20.190.20; 14-3-3 domain; 1. +DR IDEAL; IID00038; -. +DR InterPro; IPR000308; 14-3-3. +DR InterPro; IPR023409; 14-3-3_CS. +DR InterPro; IPR036815; 14-3-3_dom_sf. +DR InterPro; IPR023410; 14-3-3_domain. +DR PANTHER; PTHR18860; 14-3-3 PROTEIN; 1. +DR PANTHER; PTHR18860:SF28; 14-3-3 PROTEIN BETA_ALPHA; 1. +DR Pfam; PF00244; 14-3-3; 1. +DR PIRSF; PIRSF000868; 14-3-3; 1. +DR PRINTS; PR00305; 1433ZETA. +DR SMART; SM00101; 14_3_3; 1. 
+DR SUPFAM; SSF48445; 14-3-3 protein; 1. +DR PROSITE; PS00796; 1433_1; 1. +DR PROSITE; PS00797; 1433_2; 1. +PE 1: Evidence at protein level; +KW 3D-structure; Acetylation; Alternative initiation; Cytoplasm; +KW Direct protein sequencing; Host-virus interaction; Isopeptide bond; +KW Membrane; Nitration; Phosphoprotein; Reference proteome; Ubl conjugation; +KW Vacuole. +SQ SEQUENCE 246 AA; 28082 MW; 6BE1A9BF97468017 CRC64; + MTMDKSELVQ KAKLAEQAER YDDMAAAMKA VTEQGHELSN EERNLLSVAY KNVVGARRSS + WRVISSIEQK TERNEKKQQM GKEYREKIEA ELQDICNDVL ELLDKYLIPN ATQPESKVFY + LKMKGDYFRY LSEVASGDNK QTTVSNSQQA YQEAFEISKK EMQPTHPIRL GLALNFSVFY + YEILNSPEKA CSLAKTAFDE AIAELDTLNE ESYKDSTLIM QLLRDNLTLW TSENQGDEGD + AGEGEN +// +ID 1433E_HUMAN Reviewed; 255 AA. +AC P62258; B3KY71; D3DTH5; P29360; P42655; Q4VJB6; Q53XZ5; Q63631; Q7M4R4; +DT 05-JUL-2004, integrated into UniProtKB/Swiss-Prot. +DT 05-JUL-2004, sequence version 1. +DT 29-MAY-2024, entry version 207. +DE RecName: Full=14-3-3 protein epsilon; +DE Short=14-3-3E; +DE EC=2.1.1.148 {ECO:0000256|HAMAP-Rule:MF_01408}; +GN Name=YWHAE; Synonyms=YWHAE1; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606; +DR EMBL; U20972; AAC50175.1; -; mRNA. +DR EMBL; U54778; AAC50710.1; -; mRNA. +DR EMBL; U43399; AAC50625.1; -; mRNA. +DR EMBL; U43430; AAD00026.1; -; mRNA. +DR EMBL; U28936; AAA75301.1; -; mRNA. +DR EMBL; AB017103; BAA32538.1; -; Genomic_DNA. +DR EMBL; AY883089; AAX68683.1; -; mRNA. +DR EMBL; AK128785; BAG54733.1; -; mRNA. +DR EMBL; AK295260; BAG58249.1; -; mRNA. +DR EMBL; AK316185; BAH14556.1; -; mRNA. +DR EMBL; BT007161; AAP35825.1; -; mRNA. +DR EMBL; CH471108; EAW90628.1; -; Genomic_DNA. +DR EMBL; CH471108; EAW90629.1; -; Genomic_DNA. +DR EMBL; BC000179; AAH00179.1; -; mRNA. +DR EMBL; BC001440; AAH01440.1; -; mRNA. +DR PIR; A61235; A61235. +DR PIR; I38947; I38947. +DR RefSeq; NP_006752.1; NM_006761.4. [P62258-1] +DR PDB; 2BR9; X-ray; 1.75 A; A=1-233. +DR PDB; 3UAL; X-ray; 1.80 A; A=1-232. +DR PDB; 3UBW; X-ray; 1.90 A; A=1-234. +DR PDB; 6EIH; X-ray; 2.70 A; A=3-232. +DR PDB; 7C8E; X-ray; 3.16 A; A/B=1-232. +DR PDB; 7V9B; X-ray; 1.85 A; A=1-232. +DR PDB; 8DGM; X-ray; 3.20 A; A=1-255. +DR PDB; 8DGN; X-ray; 3.16 A; A=1-255. +DR PDB; 8DGP; X-ray; 2.70 A; A/B/C/D=1-255. +DR PDB; 8DP5; EM; 3.10 A; D=1-255. +DR PDB; 8Q1S; X-ray; 3.23 A; A/B=1-255. +DR PDBsum; 2BR9; -. +DR PDBsum; 3UAL; -. +DR PDBsum; 3UBW; -. +DR PDBsum; 6EIH; -. +DR PDBsum; 7C8E; -. +DR PDBsum; 7V9B; -. +DR PDBsum; 8DGM; -. +DR PDBsum; 8DGN; -. +DR PDBsum; 8DGP; -. +DR PDBsum; 8DP5; -. +DR PDBsum; 8Q1S; -. +DR AlphaFoldDB; P62258; -. +DR EMDB; EMD-27630; -. +DR SMR; P62258; -. +DR BioGRID; 113363; 1160. +DR CORUM; P62258; -. +DR DIP; DIP-36676N; -. +DR ELM; P62258; -. +DR IntAct; P62258; 679. +DR MINT; P62258; -. +DR STRING; 9606.ENSP00000264335; -. +DR ChEMBL; CHEMBL3329082; -. +DR DrugBank; DB01780; Fusicoccin. +DR DrugBank; DB12695; Phenethyl Isothiocyanate. +DR MoonDB; P62258; Predicted. +DR TCDB; 8.A.98.1.10; the 14-3-3 protein (14-3-3) family. +DR GlyGen; P62258; 1 site, 1 O-linked glycan (1 site). +DR iPTMnet; P62258; -. +DR MetOSite; P62258; -. +DR PhosphoSitePlus; P62258; -. +DR SwissPalm; P62258; -. +DR BioMuta; YWHAE; -. +DR DMDM; 51702210; -. +DR OGP; P42655; -. +DR jPOST; P62258; -. +DR MassIVE; P62258; -. +DR MaxQB; P62258; -. +DR PaxDb; 9606-ENSP00000264335; -. +DR PeptideAtlas; P62258; -. +DR PRIDE; P62258; -. 
+DR ProteomicsDB; 57377; -. +DR ProteomicsDB; 57378; -. [P62258-2] +DR Pumba; P62258; -. +DR TopDownProteomics; P62258-1; -. [P62258-1] +DR Antibodypedia; 1898; 612 antibodies from 40 providers. +DR CPTC; P62258; 3 antibodies. +DR DNASU; 7531; -. +DR Ensembl; ENST00000264335.13; ENSP00000264335.8; ENSG00000108953.17. [P62258-1] +DR Ensembl; ENST00000571732.5; ENSP00000461762.1; ENSG00000108953.17. [P62258-2] +DR Ensembl; ENST00000616643.3; ENSP00000481059.2; ENSG00000274474.3. [P62258-2] +DR Ensembl; ENST00000627231.2; ENSP00000487356.1; ENSG00000274474.3. [P62258-1] +DR GeneID; 7531; -. +DR KEGG; hsa:7531; -. +DR MANE-Select; ENST00000264335.13; ENSP00000264335.8; NM_006761.5; NP_006752.1. +DR AGR; HGNC:12851; -. +DR CTD; 7531; -. +DR DisGeNET; 7531; -. +DR GeneCards; YWHAE; -. +DR HPA; ENSG00000108953; Low tissue specificity. +DR MalaCards; YWHAE; -. +DR neXtProt; NX_P62258; -. +DR OpenTargets; ENSG00000108953; -. +DR PharmGKB; PA37440; -. +DR VEuPathDB; HostDB:ENSG00000108953; -. +DR eggNOG; KOG0841; Eukaryota. +DR GeneTree; ENSGT01110000267238; -. +DR HOGENOM; CLU_058290_0_0_1; -. +DR InParanoid; P62258; -. +DR OMA; KGCQLAR; -. +DR OrthoDB; 920089at2759; -. +DR PhylomeDB; P62258; -. +DR TreeFam; TF102003; -. +DR PathwayCommons; P62258; -. +DR SignaLink; P62258; -. +DR SIGNOR; P62258; -. +DR BioGRID-ORCS; 7531; 212 hits in 1128 CRISPR screens. +DR ChiTaRS; YWHAE; human. +DR EvolutionaryTrace; P62258; -. +DR GeneWiki; YWHAE; -. +DR Pharos; P62258; Tbio. +DR PRO; PR:P62258; -. +DR Proteomes; UP000005640; Chromosome 17. +DR RNAct; P62258; Protein. +DR Bgee; ENSG00000108953; Expressed in superior frontal gyrus and 116 other cell types or tissues. +DR ExpressionAtlas; P62258; baseline and differential. +DR CDD; cd10020; 14-3-3_epsilon; 1. +DR Gene3D; 1.20.190.20; 14-3-3 domain; 1. +DR IDEAL; IID00512; -. +DR InterPro; IPR000308; 14-3-3. +DR InterPro; IPR023409; 14-3-3_CS. +DR InterPro; IPR036815; 14-3-3_dom_sf. +DR InterPro; IPR023410; 14-3-3_domain. +DR PANTHER; PTHR18860; 14-3-3 PROTEIN; 1. +DR PANTHER; PTHR18860:SF17; 14-3-3 PROTEIN EPSILON; 1. +DR Pfam; PF00244; 14-3-3; 1. +DR PIRSF; PIRSF000868; 14-3-3; 1. +DR PRINTS; PR00305; 1433ZETA. +DR SMART; SM00101; 14_3_3; 1. +DR SUPFAM; SSF48445; 14-3-3 protein; 1. +DR PROSITE; PS00796; 1433_1; 1. +DR PROSITE; PS00797; 1433_2; 1. +PE 1: Evidence at protein level; +KW 3D-structure; Acetylation; Alternative splicing; Cytoplasm; +KW Direct protein sequencing; Host-virus interaction; Isopeptide bond; +KW Nucleus; Phosphoprotein; Reference proteome; Ubl conjugation. +SQ SEQUENCE 255 AA; 29174 MW; 07817CCBD1F75B26 CRC64; + MDDREDLVYQ AKLAEQAERY DEMVESMKKV AGMDVELTVE ERNLLSVAYK NVIGARRASW + RIISSIEQKE ENKGGEDKLK MIREYRQMVE TELKLICCDI LDVLDKHLIP AANTGESKVF + YYKMKGDYHR YLAEFATGND RKEAAENSLV AYKAASDIAM TELPPTHPIR LGLALNFSVF + YYEILNSPDR ACRLAKAAFD DAIAELDTLS EESYKDSTLI MQLLRDNLTL WTSDMQGDGE + EQNKEALQDV EDENQ +// +ID 1433F_HUMAN Reviewed; 246 AA. +AC Q04917; +DT 01-OCT-1993, integrated into UniProtKB/Swiss-Prot. +DT 23-JAN-2007, sequence version 4. +DT 29-MAY-2024, entry version 238. +DE RecName: Full=14-3-3 protein eta; +DE AltName: Full=Protein AS1; +GN Name=YWHAH; Synonyms=YWHA1; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606; +CC CAUTION: The sequence shown here is derived from an Ensembl +DR EMBL; L20422; AAA35483.1; -; mRNA. +DR EMBL; X80536; CAA56676.1; -; Genomic_DNA. 
+DR EMBL; X78138; CAA55017.1; -; mRNA. +DR EMBL; X57345; CAA40620.1; -; mRNA. +DR EMBL; D78577; BAA11418.1; -; Genomic_DNA. +DR EMBL; S80794; AAB36036.1; -; mRNA. +DR EMBL; CR456612; CAG30498.1; -; mRNA. +DR EMBL; Z82248; -; NOT_ANNOTATED_CDS; Genomic_DNA. +DR EMBL; BC003047; AAH03047.1; -; mRNA. +DR PIR; S34756; S34756. +DR PIR; S38509; S38509. +DR PIR; S38532; S38532. +DR RefSeq; NP_003396.1; NM_003405.3. +DR PDB; 2C63; X-ray; 2.15 A; A/B/C/D=2-246. +DR PDB; 2C74; X-ray; 2.70 A; A/B=2-246. +DR PDB; 7NMZ; X-ray; 2.30 A; AA/BA=1-234. +DR PDBsum; 2C63; -. +DR PDBsum; 2C74; -. +DR PDBsum; 7NMZ; -. +DR AlphaFoldDB; Q04917; -. +DR SMR; Q04917; -. +DR BioGRID; 113365; 1114. +DR CORUM; Q04917; -. +DR DIP; DIP-27566N; -. +DR ELM; Q04917; -. +DR IntAct; Q04917; 937. +DR MINT; Q04917; -. +DR STRING; 9606.ENSP00000248975; -. +DR BindingDB; Q04917; -. +DR ChEMBL; CHEMBL3708585; -. +DR DrugBank; DB12695; Phenethyl Isothiocyanate. +DR GlyGen; Q04917; 1 site, 1 O-linked glycan (1 site). +DR iPTMnet; Q04917; -. +DR MetOSite; Q04917; -. +DR PhosphoSitePlus; Q04917; -. +DR SwissPalm; Q04917; -. +DR BioMuta; YWHAH; -. +DR DMDM; 1345593; -. +DR jPOST; Q04917; -. +DR MassIVE; Q04917; -. +DR MaxQB; Q04917; -. +DR PaxDb; 9606-ENSP00000248975; -. +DR PeptideAtlas; Q04917; -. +DR ProteomicsDB; 58300; -. +DR Pumba; Q04917; -. +DR TopDownProteomics; Q04917; -. +DR Antibodypedia; 11204; 346 antibodies from 37 providers. +DR DNASU; 7533; -. +DR Ensembl; ENST00000248975.6; ENSP00000248975.5; ENSG00000128245.15. +DR GeneID; 7533; -. +DR KEGG; hsa:7533; -. +DR MANE-Select; ENST00000248975.6; ENSP00000248975.5; NM_003405.4; NP_003396.1. +DR AGR; HGNC:12853; -. +DR CTD; 7533; -. +DR DisGeNET; 7533; -. +DR GeneCards; YWHAH; -. +DR HPA; ENSG00000128245; Tissue enriched (brain). +DR neXtProt; NX_Q04917; -. +DR OpenTargets; ENSG00000128245; -. +DR PharmGKB; PA37442; -. +DR VEuPathDB; HostDB:ENSG00000128245; -. +DR eggNOG; KOG0841; Eukaryota. +DR GeneTree; ENSGT01090000260040; -. +DR HOGENOM; CLU_058290_0_0_1; -. +DR InParanoid; Q04917; -. +DR OMA; IEQKTMS; -. +DR OrthoDB; 920089at2759; -. +DR PhylomeDB; Q04917; -. +DR TreeFam; TF102003; -. +DR PathwayCommons; Q04917; -. +DR SignaLink; Q04917; -. +DR SIGNOR; Q04917; -. +DR BioGRID-ORCS; 7533; 15 hits in 1154 CRISPR screens. +DR ChiTaRS; YWHAH; human. +DR EvolutionaryTrace; Q04917; -. +DR GeneWiki; YWHAH; -. +DR Pharos; Q04917; Tbio. +DR PRO; PR:Q04917; -. +DR Proteomes; UP000005640; Chromosome 22. +DR RNAct; Q04917; Protein. +DR Bgee; ENSG00000128245; Expressed in frontal pole and 196 other cell types or tissues. +DR ExpressionAtlas; Q04917; baseline and differential. +DR CDD; cd10025; 14-3-3_eta; 1. +DR Gene3D; 1.20.190.20; 14-3-3 domain; 1. +DR InterPro; IPR000308; 14-3-3. +DR InterPro; IPR023409; 14-3-3_CS. +DR InterPro; IPR036815; 14-3-3_dom_sf. +DR InterPro; IPR023410; 14-3-3_domain. +DR PANTHER; PTHR18860; 14-3-3 PROTEIN; 1. +DR PANTHER; PTHR18860:SF16; 14-3-3 PROTEIN ETA; 1. +DR Pfam; PF00244; 14-3-3; 1. +DR PIRSF; PIRSF000868; 14-3-3; 1. +DR PRINTS; PR00305; 1433ZETA. +DR SMART; SM00101; 14_3_3; 1. +DR SUPFAM; SSF48445; 14-3-3 protein; 1. +DR PROSITE; PS00796; 1433_1; 1. +DR PROSITE; PS00797; 1433_2; 1. +PE 1: Evidence at protein level; +KW 3D-structure; Acetylation; Direct protein sequencing; Phosphoprotein; +KW Reference proteome. +SQ SEQUENCE 246 AA; 28219 MW; D70FBC100C45D6E5 CRC64; + MGDREQLLQR ARLAEQAERY DDMASAMKAV TELNEPLSNE DRNLLSVAYK NVVGARRSSW + EAGEGN +// +ID 1433G_HUMAN Reviewed; 247 AA. 
+AC P61981; O70457; P35214; Q6FH52; Q9UDP2; Q9UN99; +DT 07-JUN-2004, integrated into UniProtKB/Swiss-Prot. +DT 23-JAN-2007, sequence version 2. +DT 29-MAY-2024, entry version 197. +DE RecName: Full=14-3-3 protein gamma {ECO:0000305}; +DE AltName: Full=Protein kinase C inhibitor protein 1; +DE Short=KCIP-1; +DE Contains: +DE RecName: Full=14-3-3 protein gamma, N-terminally processed; +GN Name=YWHAG {ECO:0000312|HGNC:HGNC:12852}; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606; +DR EMBL; AF142498; AAD48408.1; -; mRNA. +DR EMBL; AB024334; BAA85184.1; -; mRNA. +DR EMBL; CR541904; CAG46702.1; -; mRNA. +DR EMBL; CR541925; CAG46723.1; -; mRNA. +DR EMBL; AC006388; -; NOT_ANNOTATED_CDS; Genomic_DNA. +DR EMBL; BC020963; AAH20963.1; -; mRNA. +DR RefSeq; NP_036611.2; NM_012479.3. +DR PDB; 2B05; X-ray; 2.55 A; A/B/C/D/E/F=2-247. +DR PDB; 3UZD; X-ray; 1.86 A; A=1-247. +DR PDB; 4E2E; X-ray; 2.25 A; A=1-247. +DR PDB; 4J6S; X-ray; 3.08 A; A/B/C/D=1-247. +DR PDB; 4O46; X-ray; 2.90 A; A/B/C/D/E/F=1-247. +DR PDB; 5D3E; X-ray; 2.75 A; A/B/E/F/I/J=1-238. +DR PDB; 6A5S; X-ray; 2.10 A; A/B/D/G=1-247. +DR PDB; 6BYJ; X-ray; 2.90 A; A/B/C/D/E/F=2-241. +DR PDB; 6BYL; X-ray; 3.35 A; A/B/C/D/E/F=2-241. +DR PDB; 6BZD; X-ray; 2.67 A; A/B/C/D=2-247. +DR PDB; 6FEL; X-ray; 2.84 A; A/B/C/D=1-234. +DR PDB; 6GKF; X-ray; 2.60 A; A/B/C/D/E/F/G/H=1-234. +DR PDB; 6GKG; X-ray; 2.85 A; A/B/C/D/E/F/G/H=1-234. +DR PDB; 6S9K; X-ray; 1.60 A; A=1-234. +DR PDB; 6SAD; X-ray; 2.75 A; A/B=1-234. +DR PDB; 6Y4K; X-ray; 3.00 A; A/B=1-234. +DR PDB; 6Y6B; X-ray; 3.08 A; A/B=1-234. +DR PDB; 6ZBT; X-ray; 1.80 A; A/B/C/D=1-234. +DR PDB; 6ZC9; X-ray; 1.90 A; A/B/C/D=1-234. +DR PDB; 7A6R; X-ray; 2.70 A; A/B/C/D=1-234. +DR PDB; 7A6Y; X-ray; 2.50 A; A/B/C/D=1-234. +DR PDBsum; 2B05; -. +DR PDBsum; 3UZD; -. +DR PDBsum; 4E2E; -. +DR PDBsum; 4J6S; -. +DR PDBsum; 4O46; -. +DR PDBsum; 5D3E; -. +DR PDBsum; 6A5S; -. +DR PDBsum; 6BYJ; -. +DR PDBsum; 6BYL; -. +DR PDBsum; 6BZD; -. +DR PDBsum; 6FEL; -. +DR PDBsum; 6GKF; -. +DR PDBsum; 6GKG; -. +DR PDBsum; 6S9K; -. +DR PDBsum; 6SAD; -. +DR PDBsum; 6Y4K; -. +DR PDBsum; 6Y6B; -. +DR PDBsum; 6ZBT; -. +DR PDBsum; 6ZC9; -. +DR PDBsum; 7A6R; -. +DR PDBsum; 7A6Y; -. +DR AlphaFoldDB; P61981; -. +DR SASBDB; P61981; -. +DR SMR; P61981; -. +DR BioGRID; 113364; 1250. +DR CORUM; P61981; -. +DR DIP; DIP-33406N; -. +DR ELM; P61981; -. +DR IntAct; P61981; 1062. +DR MINT; P61981; -. +DR STRING; 9606.ENSP00000306330; -. +DR BindingDB; P61981; -. +DR ChEMBL; CHEMBL1293296; -. +DR TCDB; 8.A.98.1.11; the 14-3-3 protein (14-3-3) family. +DR GlyGen; P61981; 1 site, 1 O-linked glycan (1 site). +DR iPTMnet; P61981; -. +DR MetOSite; P61981; -. +DR PhosphoSitePlus; P61981; -. +DR SwissPalm; P61981; -. +DR BioMuta; YWHAG; -. +DR DMDM; 48428721; -. +DR REPRODUCTION-2DPAGE; IPI00220642; -. +DR CPTAC; CPTAC-450; -. +DR CPTAC; CPTAC-451; -. +DR jPOST; P61981; -. +DR MassIVE; P61981; -. +DR MaxQB; P61981; -. +DR PaxDb; 9606-ENSP00000306330; -. +DR PeptideAtlas; P61981; -. +DR PRIDE; P61981; -. +DR ProteomicsDB; 57355; -. +DR Pumba; P61981; -. +DR TopDownProteomics; P61981; -. +DR Antibodypedia; 4339; 621 antibodies from 41 providers. +DR DNASU; 7532; -. +DR Ensembl; ENST00000307630.5; ENSP00000306330.3; ENSG00000170027.7. +DR GeneID; 7532; -. +DR KEGG; hsa:7532; -. +DR MANE-Select; ENST00000307630.5; ENSP00000306330.3; NM_012479.4; NP_036611.2. +DR AGR; HGNC:12852; -. +DR CTD; 7532; -. 
+DR DisGeNET; 7532; -. +DR GeneCards; YWHAG; -. +DR HPA; ENSG00000170027; Tissue enhanced (brain, skeletal muscle). +DR MalaCards; YWHAG; -. +DR neXtProt; NX_P61981; -. +DR OpenTargets; ENSG00000170027; -. +DR PharmGKB; PA37441; -. +DR VEuPathDB; HostDB:ENSG00000170027; -. +DR eggNOG; KOG0841; Eukaryota. +DR GeneTree; ENSGT01090000260040; -. +DR HOGENOM; CLU_058290_0_0_1; -. +DR InParanoid; P61981; -. +DR OMA; AYGEAHE; -. +DR OrthoDB; 920089at2759; -. +DR PhylomeDB; P61981; -. +DR TreeFam; TF102003; -. +DR PathwayCommons; P61981; -. +DR SignaLink; P61981; -. +DR SIGNOR; P61981; -. +DR BioGRID-ORCS; 7532; 29 hits in 1163 CRISPR screens. +DR ChiTaRS; YWHAG; human. +DR EvolutionaryTrace; P61981; -. +DR GeneWiki; YWHAG; -. +DR Pharos; P61981; Tchem. +DR PRO; PR:P61981; -. +DR Proteomes; UP000005640; Chromosome 7. +DR RNAct; P61981; Protein. +DR Bgee; ENSG00000170027; Expressed in lateral nuclear group of thalamus and 193 other cell types or tissues. +DR CDD; cd10024; 14-3-3_gamma; 1. +DR Gene3D; 1.20.190.20; 14-3-3 domain; 1. +DR InterPro; IPR000308; 14-3-3. +DR InterPro; IPR023409; 14-3-3_CS. +DR InterPro; IPR036815; 14-3-3_dom_sf. +DR InterPro; IPR023410; 14-3-3_domain. +DR PANTHER; PTHR18860; 14-3-3 PROTEIN; 1. +DR PANTHER; PTHR18860:SF22; 14-3-3 PROTEIN GAMMA; 1. +DR Pfam; PF00244; 14-3-3; 1. +DR PIRSF; PIRSF000868; 14-3-3; 1. +DR PRINTS; PR00305; 1433ZETA. +DR SMART; SM00101; 14_3_3; 1. +DR SUPFAM; SSF48445; 14-3-3 protein; 1. +DR PROSITE; PS00796; 1433_1; 1. +DR PROSITE; PS00797; 1433_2; 1. +PE 1: Evidence at protein level; +KW 3D-structure; Acetylation; Cytoplasm; Direct protein sequencing; +KW Disease variant; Epilepsy; Phosphoprotein; Reference proteome. +SQ SEQUENCE 247 AA; 28303 MW; B0D16C6DE1F4455D CRC64; + MVDREQLVQK ARLAEQAERY DDMAAAMKNV TELNEPLSNE ERNLLSVAYK NVVGARRSSW + RVISSIEQKT SADGNEKKIE MVRAYREKIE KELEAVCQDV LSLLDNYLIK NCSETQYESK + VFYLKMKGDY YRYLAEVATG EKRATVVESS EKAYSEAHEI SKEHMQPTHP IRLGLALNYS + VFYYEIQNAP EQACHLAKTA FDDAIAELDT LNEDSYKDST LIMQLLRDNL TLWTSDQQDD + DGGEGNN +// diff --git a/src/python/test/xrefs/parsers/flatfiles/uniprot_trembl.txt b/src/python/test/xrefs/parsers/flatfiles/uniprot_trembl.txt new file mode 100644 index 000000000..23cfd58d6 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/uniprot_trembl.txt @@ -0,0 +1,570 @@ +ID E5KP32_HUMAN Unreviewed; 535 AA. +AC E5KP32; +DT 08-FEB-2011, integrated into UniProtKB/TrEMBL. +DT 08-FEB-2011, sequence version 1. +DT 29-MAY-2024, entry version 50. +DE RecName: Full=Adenine DNA glycosylase {ECO:0000256|ARBA:ARBA00022023, ECO:0000256|RuleBase:RU365096}; +DE EC=3.2.2.31 {ECO:0000256|ARBA:ARBA00012045, ECO:0000256|RuleBase:RU365096}; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:ADP90939.1}; +DR EMBL; HQ205467; ADP90939.1; -; Genomic_DNA. +DR EMBL; HQ205471; ADP90959.1; -; Genomic_DNA. +DR EMBL; HQ205478; ADP90994.1; -; Genomic_DNA. +DR EMBL; HQ205504; ADP91124.1; -; Genomic_DNA. +DR AlphaFoldDB; E5KP32; -. +DR PeptideAtlas; E5KP32; -. +DR CDD; cd03431; DNA_Glycosylase_C; 1. +DR CDD; cd00056; ENDO3c; 1. +DR Gene3D; 1.10.1670.10; Helix-hairpin-Helix base-excision DNA repair enzymes (C-terminal); 1. +DR Gene3D; 3.90.79.10; Nucleoside Triphosphate Pyrophosphohydrolase; 1. +DR InterPro; IPR011257; DNA_glycosylase. +DR InterPro; IPR004036; Endonuclease-III-like_CS2. +DR InterPro; IPR003651; Endonuclease3_FeS-loop_motif. 
+DR InterPro; IPR004035; Endouclease-III_FeS-bd_BS. +DR InterPro; IPR003265; HhH-GPD_domain. +DR InterPro; IPR023170; HhH_base_excis_C. +DR InterPro; IPR000445; HhH_motif. +DR InterPro; IPR044298; MIG/MutY. +DR InterPro; IPR029119; MutY_C. +DR InterPro; IPR015797; NUDIX_hydrolase-like_dom_sf. +DR InterPro; IPR000086; NUDIX_hydrolase_dom. +DR PANTHER; PTHR42944; ADENINE DNA GLYCOSYLASE; 1. +DR PANTHER; PTHR42944:SF1; ADENINE DNA GLYCOSYLASE; 1. +DR Pfam; PF00633; HHH; 1. +DR Pfam; PF00730; HhH-GPD; 1. +DR Pfam; PF14815; NUDIX_4; 1. +DR SMART; SM00478; ENDO3c; 1. +DR SMART; SM00525; FES; 1. +DR SUPFAM; SSF48150; DNA-glycosylase; 1. +DR SUPFAM; SSF55811; Nudix; 1. +DR PROSITE; PS00764; ENDONUCLEASE_III_1; 1. +DR PROSITE; PS01155; ENDONUCLEASE_III_2; 1. +DR PROSITE; PS51462; NUDIX; 1. +PE 3: Inferred from homology; +KW 4Fe-4S {ECO:0000256|ARBA:ARBA00022485}; +KW DNA damage {ECO:0000256|ARBA:ARBA00022763, ECO:0000256|RuleBase:RU365096}; +KW DNA repair {ECO:0000256|ARBA:ARBA00023204}; +KW Glycosidase {ECO:0000256|ARBA:ARBA00023295, ECO:0000256|RuleBase:RU365096}; +KW Hydrolase {ECO:0000256|ARBA:ARBA00022801}; +KW Iron {ECO:0000256|ARBA:ARBA00023004, ECO:0000256|RuleBase:RU365096}; +KW Iron-sulfur {ECO:0000256|ARBA:ARBA00023014}; +KW Metal-binding {ECO:0000256|ARBA:ARBA00022723}. +SQ SEQUENCE 535 AA; 59080 MW; 4F7956A45A21226A CRC64; + MTPLVSRLSR LWAIMRKPRA AVGSGHRKQA ASQEGRQKHA KNNSQAKPSA CDGLARQPEE + VVLQASVSSY HLFRDVAEVT AFRGSLLSWY DQEKRDLPWR RRAEDEMDLD RRAYAVWVSE + VMLQQTQVAT VINYYTGWMQ KWPTLQDLAS ASLEEVNQLW AGLGYYSRGR RLQEGARKVV + EELGGHMPRT AETLQQLLPG VGRYTAGAIA SIAFGQATGV VDGNVARVLC RVRAIGADPS + STLVSQQLWG LAQQLVDPAR PGDFNQAAME LGATVCTPQR PLCSQCPVES LCRARQRVEQ + EQLLASGSLS GSPDVEECAP NTGHCHLCLP PSEPWDQTLG VVNFPRKASR KPPREESSAT + CVLEQPGALG AQILLVQRPN SGLLAGLWEF PSVTWEPSEQ LQRKALLQEL QRWAGPLPAT + HLRHLGEVVH TFSHIKLTYQ VYGLALEGQT PVTTVPPGAR WLTQEEFHTA AVSTAMKKVF + RVYQGQQPGT CMGSKRSQVS SPCSRKKPRM GQQVLDNFFR SHISTDAHSL NSAAQ +// +ID A0A1U9X8M5_COW Unreviewed; 395 AA. +AC A0A1U9X8M5; +DT 07-JUN-2017, integrated into UniProtKB/TrEMBL. +DT 07-JUN-2017, sequence version 1. +DT 29-MAY-2024, entry version 41. +DE RecName: Full=Tripartite motif-containing protein 10 {ECO:0000256|ARBA:ARBA00014653}; +OS Bos taurus (Cow). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9913 {ECO:0000313|EMBL:AQY77152.1}; +DR EMBL; KY500679; AQY77151.1; -; Genomic_DNA. +DR EMBL; KY500680; AQY77152.1; -; Genomic_DNA. +DR RefSeq; NP_439893.2; NM_052828.2. +DR AlphaFoldDB; A0A1U9X8M5; -. +DR SMR; A0A1U9X8M5; -. +DR Antibodypedia; 26232; 191 antibodies from 19 providers. +DR DNASU; 10107; -. +DR GeneID; 10107; -. +DR CTD; 10107; -. +DR DisGeNET; 10107; -. +DR VEuPathDB; HostDB:ENSG00000204613; -. +DR OrthoDB; 3453019at2759; -. +DR ExpressionAtlas; A0A1U9X8M5; baseline and differential. +DR CDD; cd16593; RING-HC_TRIM10_C-IV; 1. +DR Gene3D; 2.60.120.920; -; 1. +DR Gene3D; 3.30.160.60; Classic Zinc Finger; 1. +DR Gene3D; 3.30.40.10; Zinc/RING finger domain, C3HC4 (zinc finger); 1. +DR InterPro; IPR001870; B30.2/SPRY. +DR InterPro; IPR043136; B30.2/SPRY_sf. +DR InterPro; IPR003879; Butyrophylin_SPRY. +DR InterPro; IPR013320; ConA-like_dom_sf. +DR InterPro; IPR006574; PRY. +DR InterPro; IPR042784; TRIM10_RING-HC. +DR InterPro; IPR000315; Znf_B-box. +DR InterPro; IPR018957; Znf_C3HC4_RING-type. +DR InterPro; IPR001841; Znf_RING. +DR InterPro; IPR013083; Znf_RING/FYVE/PHD. 
+DR InterPro; IPR017907; Znf_RING_CS. +DR PANTHER; PTHR24103; E3 UBIQUITIN-PROTEIN LIGASE TRIM; 1. +DR PANTHER; PTHR24103:SF329; TRIPARTITE MOTIF-CONTAINING PROTEIN 10; 1. +DR Pfam; PF13765; PRY; 1. +DR Pfam; PF00643; zf-B_box; 1. +DR Pfam; PF00097; zf-C3HC4; 1. +DR PRINTS; PR01407; BUTYPHLNCDUF. +DR SMART; SM00336; BBOX; 1. +DR SMART; SM00589; PRY; 1. +DR SMART; SM00184; RING; 1. +DR SUPFAM; SSF57845; B-box zinc-binding domain; 1. +DR SUPFAM; SSF49899; Concanavalin A-like lectins/glucanases; 1. +DR SUPFAM; SSF57850; RING/U-box; 1. +DR PROSITE; PS50188; B302_SPRY; 1. +DR PROSITE; PS50119; ZF_BBOX; 1. +DR PROSITE; PS00518; ZF_RING_1; 1. +DR PROSITE; PS50089; ZF_RING_2; 1. +PE 3: Inferred from homology; +KW Metal-binding {ECO:0000256|ARBA:ARBA00022723}; +KW Zinc {ECO:0000256|ARBA:ARBA00022833}; +KW Zinc-finger {ECO:0000256|ARBA:ARBA00022771, ECO:0000256|PROSITE- +KW ProRule:PRU00024}. +SQ SEQUENCE 395 AA; 45252 MW; EDEFCB7027B6C15D CRC64; + MASAASVTSL ADEVNCPICQ GTLREPVTID CGHNFCRACL TRYCEIPGPD LEESPTCPLC + KEPFRPGSFR PNWQLANVVE NIERLQLVST LGLGEEDVCQ EHGEKIYFFC EDDEMQLCVV + CREAGEHATH TMRFLEDAAA PYREQIHKCL KCLRKEREEI QEIQSRENKR MQVLLTQVST + KRQQVISEFA HLRKFLEEQQ SILLAQLESQ DGDILRQRDE FDLLVAGEIC RFSALIEELE + EKNERPAREL LTDIRSTLIR CETRKCRKPV AVSPELGQRI RDFPQQALPL QREMKMFLEK + LCFELDYEPA HISLDPQTSH PKLLLSEDHQ RAQFSYKWQN SPDNPQRFDR ATCVLAHTGI + TGGRHTWVWM ARVPGDSGCC QFCSPPSVLG TEVAA +// +ID A0A7D5YZ42_HUMAN Unreviewed; 106 AA. +AC A0A7D5YZ42; +DT 02-DEC-2020, integrated into UniProtKB/TrEMBL. +DT 02-DEC-2020, sequence version 1. +DT 29-MAY-2024, entry version 12. +DE SubName: Full=Cytochrome P450 2C9 {ECO:0000313|EMBL:QLI62784.1}; +DE Flags: Fragment; +GN Name=CYP2C9 {ECO:0000313|EMBL:QLI62784.1}; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:QLI62784.1}; +DR EMBL; MN614169; QLI62674.1; -; Genomic_DNA. +DR EMBL; MN614175; QLI62680.1; -; Genomic_DNA. +DR EMBL; MN614279; QLI62784.1; -; Genomic_DNA. +DR AlphaFoldDB; A0A7D5YZ42; -. +DR PeptideAtlas; A0A7D5YZ42; -. +DR Gene3D; 1.10.630.10; Cytochrome P450; 1. +DR InterPro; IPR001128; Cyt_P450. +DR InterPro; IPR002401; Cyt_P450_E_grp-I. +DR InterPro; IPR036396; Cyt_P450_sf. +DR PANTHER; PTHR24300:SF336; CYTOCHROME P450 2C9; 1. +DR PANTHER; PTHR24300; CYTOCHROME P450 508A4-RELATED; 1. +DR Pfam; PF00067; p450; 1. +DR PRINTS; PR00463; EP450I. +DR SUPFAM; SSF48264; Cytochrome P450; 1. +PE 3: Inferred from homology; +KW Heme {ECO:0000256|ARBA:ARBA00022617}; Iron {ECO:0000256|ARBA:ARBA00023004}; +KW Metal-binding {ECO:0000256|ARBA:ARBA00022723}. +SQ SEQUENCE 106 AA; 12035 MW; 9894A14D4BE1A349 CRC64; + LSKVYGPVFT LYFGLKPIVV LHGYEAVKEA LIDLGEEFSG RGIFPLAERA NRGFGIVFSN + GKKWKEIRHF SLMTLRNFGM GKRSIEDRVQ EEARCLVEEL RKTKGG +// +ID K4GY12_HUMAN Unreviewed; 226 AA. +AC K4GY12; +DT 09-JAN-2013, integrated into UniProtKB/TrEMBL. +DT 09-JAN-2013, sequence version 1. +DT 29-MAY-2024, entry version 40. +DE RecName: Full=ATP synthase subunit a {ECO:0000256|ARBA:ARBA00021312, ECO:0000256|RuleBase:RU004450}; +GN Name=ATP6 {ECO:0000313|EMBL:AFP96372.1}; +OS Homo sapiens (Human). +OG Mitochondrion {ECO:0000313|EMBL:AFP96372.1}. +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:AFP96372.1}; +DR EMBL; JX289125; AFP96372.1; -; Genomic_DNA. 
+DR EMBL; MF621083; ATP05077.1; -; Genomic_DNA. +DR AlphaFoldDB; K4GY12; -. +DR PeptideAtlas; K4GY12; -. +DR ChiTaRS; ATP6; human. +DR CDD; cd00310; ATP-synt_Fo_a_6; 1. +DR Gene3D; 1.20.120.220; ATP synthase, F0 complex, subunit A; 1. +DR InterPro; IPR000568; ATP_synth_F0_asu. +DR InterPro; IPR023011; ATP_synth_F0_asu_AS. +DR InterPro; IPR045083; ATP_synth_F0_asu_bact/mt. +DR InterPro; IPR035908; F0_ATP_A_sf. +DR NCBIfam; TIGR01131; ATP_synt_6_or_A; 1. +DR PANTHER; PTHR11410; ATP SYNTHASE SUBUNIT A; 1. +DR PANTHER; PTHR11410:SF0; ATP SYNTHASE SUBUNIT A; 1. +DR Pfam; PF00119; ATP-synt_A; 1. +DR PRINTS; PR00123; ATPASEA. +DR SUPFAM; SSF81336; F1F0 ATP synthase subunit A; 1. +DR PROSITE; PS00449; ATPASE_A; 1. +PE 3: Inferred from homology; +KW ATP synthesis {ECO:0000256|ARBA:ARBA00023310}; +KW CF(0) {ECO:0000256|ARBA:ARBA00022547}; +KW Hydrogen ion transport {ECO:0000256|ARBA:ARBA00022781}; +KW Ion transport {ECO:0000256|ARBA:ARBA00023065}; +KW Membrane {ECO:0000256|ARBA:ARBA00023136, ECO:0000256|SAM:Phobius}; +KW Mitochondrion {ECO:0000256|ARBA:ARBA00023128, ECO:0000313|EMBL:AFP96372.1}; +KW Mitochondrion inner membrane {ECO:0000256|ARBA:ARBA00022792}; +KW Transmembrane {ECO:0000256|ARBA:ARBA00022692, ECO:0000256|SAM:Phobius}; +KW Transmembrane helix {ECO:0000256|ARBA:ARBA00022989, +KW ECO:0000256|SAM:Phobius}; Transport {ECO:0000256|ARBA:ARBA00022448}. +SQ SEQUENCE 226 AA; 24785 MW; 7211E3A429C0D966 CRC64; + MNENLFASFI APTILGLPAA VLIILFPPLL IPTSKYLINN RLITTQQWLI KLTSKQMMAM + HNTKGRTWSL MLVSLIIFIA TTNLLGLLPH SFTPTTQLSM NLAMAIPLWV GAVIMGFRSK + IKNALAHFLP QGTPTPLIPM LVIIETISLL IQPMALAVRL TANITAGHLL MHLIGSATLA + MSTINLPSTL IIFTILILLT ILEIAVALIQ AYVFTLLVSL YLHDNT +// +ID A0A383S2L7_HUMAN Unreviewed; 366 AA. +AC A0A383S2L7; +DT 07-NOV-2018, integrated into UniProtKB/TrEMBL. +DT 07-NOV-2018, sequence version 1. +DT 29-MAY-2024, entry version 26. +DE SubName: Full=MHC class I antigen {ECO:0000313|EMBL:SYY42737.1}; +GN Name=HLA-C {ECO:0000313|EMBL:SYY42737.1}; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:SYY42737.1}; +DR EMBL; MK005641; AZL48948.1; -; Genomic_DNA. +DR EMBL; LS992437; SYY42737.1; -; Genomic_DNA. +DR AlphaFoldDB; A0A383S2L7; -. +DR PeptideAtlas; A0A383S2L7; -. +DR ChiTaRS; HLA-C; human. +DR CDD; cd21025; IgC1_MHC_Ib_HLA-Cw3-4; 1. +DR Gene3D; 2.60.40.10; Immunoglobulins; 1. +DR Gene3D; 3.30.500.10; MHC class I-like antigen recognition-like; 1. +DR InterPro; IPR007110; Ig-like_dom. +DR InterPro; IPR036179; Ig-like_dom_sf. +DR InterPro; IPR013783; Ig-like_fold. +DR InterPro; IPR003006; Ig/MHC_CS. +DR InterPro; IPR003597; Ig_C1-set. +DR InterPro; IPR011161; MHC_I-like_Ag-recog. +DR InterPro; IPR037055; MHC_I-like_Ag-recog_sf. +DR InterPro; IPR011162; MHC_I/II-like_Ag-recog. +DR InterPro; IPR001039; MHC_I_a_a1/a2. +DR InterPro; IPR010579; MHC_I_a_C. +DR PANTHER; PTHR16675:SF252; HLA CLASS I HISTOCOMPATIBILITY ANTIGEN, C ALPHA CHAIN; 1. +DR PANTHER; PTHR16675; MHC CLASS I-RELATED; 1. +DR Pfam; PF07654; C1-set; 1. +DR Pfam; PF00129; MHC_I; 1. +DR Pfam; PF06623; MHC_I_C; 1. +DR PRINTS; PR01638; MHCCLASSI. +DR SMART; SM00407; IGc1; 1. +DR SUPFAM; SSF48726; Immunoglobulin; 1. +DR SUPFAM; SSF54452; MHC antigen-recognition domain; 1. +DR PROSITE; PS50835; IG_LIKE; 1. +DR PROSITE; PS00290; IG_MHC; 1. 
+PE 3: Inferred from homology; +KW Disulfide bond {ECO:0000256|ARBA:ARBA00023157}; +KW Glycoprotein {ECO:0000256|ARBA:ARBA00023180}; +KW Membrane {ECO:0000256|SAM:Phobius}; +KW Signal {ECO:0000256|ARBA:ARBA00022729, ECO:0000256|SAM:SignalP}; +KW Transmembrane {ECO:0000256|SAM:Phobius}; +KW Transmembrane helix {ECO:0000256|SAM:Phobius}. +SQ SEQUENCE 366 AA; 40967 MW; 8482C454FA80E378 CRC64; + MRVMAPRTLI LLLSGALALT ETWACSHSMR YFSTSVSRPG RWEPRFIAVG YVDDTQFVRF + DSDAASPRGE PRAPWVEQEG PEYWDRETQK YKRQAQTDRV SLRNLRGYYN QSEAGSHTLQ + WMFGCDLGPD GRLLRGYDQS AYDGKDYIAL NEDLRSWTAA DTAAQITQRK WEAAREAEQR + RAYLEGTCVE WLRRYLENGK ETLQRAEHPK THVTHHPVSD HEATLRCWAL GFYPAEITLT + WQWDGEDQTQ DTELVETRPA GDGTFQKWAA VVVPSGEEQR YTCHVQHEGL PEPLTLRWEP + SSQPTIPIVG IVAGLAVLAV LAVLGAVVAV VMCRRKSSGG KGGSCSQAAS SNSAQGSDES + LIACKA +// +ID O19554_HUMAN Unreviewed; 362 AA. +AC O19554; +DT 01-JAN-1998, integrated into UniProtKB/TrEMBL. +DT 01-JAN-1998, sequence version 1. +DT 29-MAY-2024, entry version 165. +DE SubName: Full=MHC class I antigen {ECO:0000313|EMBL:AAC17467.1}; +GN Name=HLA-B {ECO:0000313|EMBL:AAC17467.1}; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:AAC17467.1}; +DR EMBL; AF016641; AAC17467.1; -; mRNA. +DR EMBL; KX555571; ART85731.1; -; Genomic_DNA. +DR EMBL; LT618830; SCQ83646.1; -; Genomic_DNA. +DR PIR; S24434; S24434. +DR PIR; S24435; S24435. +DR PIR; S24436; S24436. +DR PIR; S24437; S24437. +DR PIR; S24439; S24439. +DR PIR; S24440; S24440. +DR AlphaFoldDB; O19554; -. +DR PeptideAtlas; O19554; -. +DR ChiTaRS; HLA-B; human. +DR CDD; cd21026; IgC1_MHC_Ia_HLA-B; 1. +DR Gene3D; 2.60.40.10; Immunoglobulins; 1. +DR Gene3D; 3.30.500.10; MHC class I-like antigen recognition-like; 1. +DR InterPro; IPR007110; Ig-like_dom. +DR InterPro; IPR036179; Ig-like_dom_sf. +DR InterPro; IPR013783; Ig-like_fold. +DR InterPro; IPR003006; Ig/MHC_CS. +DR InterPro; IPR003597; Ig_C1-set. +DR InterPro; IPR011161; MHC_I-like_Ag-recog. +DR InterPro; IPR037055; MHC_I-like_Ag-recog_sf. +DR InterPro; IPR011162; MHC_I/II-like_Ag-recog. +DR InterPro; IPR001039; MHC_I_a_a1/a2. +DR InterPro; IPR010579; MHC_I_a_C. +DR PANTHER; PTHR16675:SF270; HLA CLASS I HISTOCOMPATIBILITY ANTIGEN, B ALPHA CHAIN; 1. +DR PANTHER; PTHR16675; MHC CLASS I-RELATED; 1. +DR Pfam; PF07654; C1-set; 1. +DR Pfam; PF00129; MHC_I; 1. +DR Pfam; PF06623; MHC_I_C; 1. +DR PRINTS; PR01638; MHCCLASSI. +DR SMART; SM00407; IGc1; 1. +DR SUPFAM; SSF48726; Immunoglobulin; 1. +DR SUPFAM; SSF54452; MHC antigen-recognition domain; 1. +DR PROSITE; PS50835; IG_LIKE; 1. +DR PROSITE; PS00290; IG_MHC; 1. +PE 2: Evidence at transcript level; +KW Disulfide bond {ECO:0000256|ARBA:ARBA00023157}; +KW Glycoprotein {ECO:0000256|ARBA:ARBA00023180}; +KW Membrane {ECO:0000256|SAM:Phobius}; +KW Signal {ECO:0000256|ARBA:ARBA00022729, ECO:0000256|SAM:SignalP}; +KW Transmembrane {ECO:0000256|SAM:Phobius}; +KW Transmembrane helix {ECO:0000256|SAM:Phobius}. 
+SQ SEQUENCE 362 AA; 40438 MW; 8BAA65B28D3BA262 CRC64; + MRVTAPRTVL LLLSGALALT ETWAGSHSMR YFYTAMSRPG RGEPRFISVG YVDDTQFVRF + DSDAASPREE PRAPWIEQEG PEYWDRNTQI CKTNTQTYRE SLRNLRGYYN QSEAGSHTLQ + RMYGCDVGPD GRLLRGHDQY AYDGKDYIAL NEDLSSWTAA DTAAQITQRK WEAAREAEQL + RAYLEGLCVE WLRRHLENGK ETLQRADPPK THVTHHPISD HEATLRCWAL GFYPAEITLT + WQRDGEDQTQ DTELVETRPA GDRTFQKWAA VVVPSGEEQR YTCHVQHEGL PKPLTLRWEP + SSQSTIPIVG IVAGLAVLAV VVIGAVVATV MCRRKSSGGK GGSYSQAASS DSAQGSDVSL + TA +// +ID H6WGQ1_HUMAN Unreviewed; 603 AA. +AC H6WGQ1; +DT 18-APR-2012, integrated into UniProtKB/TrEMBL. +DT 18-APR-2012, sequence version 1. +DT 29-MAY-2024, entry version 65. +DE RecName: Full=NADH-ubiquinone oxidoreductase chain 5 {ECO:0000256|ARBA:ARBA00021096, ECO:0000256|RuleBase:RU003404}; +DE EC=7.1.1.2 {ECO:0000256|ARBA:ARBA00012944, ECO:0000256|RuleBase:RU003404}; +GN Name=ND5 {ECO:0000313|EMBL:AEY70993.1}; +OS Homo sapiens (Human). +OG Mitochondrion {ECO:0000313|EMBL:AEY70993.1}. +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:AEY70993.1}; +DR EMBL; JQ245734; AEY70993.1; -; Genomic_DNA. +DR EMBL; JQ245735; AEY71006.1; -; Genomic_DNA. +DR EMBL; JX135002; AFN20958.1; -; Genomic_DNA. +DR EMBL; KF450844; AGZ64615.1; -; Genomic_DNA. +DR EMBL; KJ446052; AHX46740.1; -; Genomic_DNA. +DR EMBL; KP763844; AKB98866.1; -; Genomic_DNA. +DR EMBL; MF437284; QBZ76936.1; -; Genomic_DNA. +DR AlphaFoldDB; H6WGQ1; -. +DR PeptideAtlas; H6WGQ1; -. +DR ChiTaRS; MT-ND5; human. +DR InterPro; IPR010934; NADH_DH_su5_C. +DR InterPro; IPR018393; NADHpl_OxRdtase_5_subgr. +DR InterPro; IPR001750; ND/Mrp_mem. +DR InterPro; IPR003945; NU5C-like. +DR InterPro; IPR001516; Proton_antipo_N. +DR NCBIfam; TIGR01974; NDH_I_L; 1. +DR PANTHER; PTHR42829; NADH-UBIQUINONE OXIDOREDUCTASE CHAIN 5; 1. +DR PANTHER; PTHR42829:SF2; NADH-UBIQUINONE OXIDOREDUCTASE CHAIN 5; 1. +DR Pfam; PF06455; NADH5_C; 1. +DR Pfam; PF00361; Proton_antipo_M; 1. +DR Pfam; PF00662; Proton_antipo_N; 1. +DR PRINTS; PR01434; NADHDHGNASE5. +PE 3: Inferred from homology; +KW Electron transport {ECO:0000256|ARBA:ARBA00022982}; +KW Membrane {ECO:0000256|ARBA:ARBA00023136, ECO:0000256|RuleBase:RU003404}; +KW Mitochondrion {ECO:0000256|ARBA:ARBA00023128, +KW ECO:0000256|RuleBase:RU003404}; +KW NAD {ECO:0000256|ARBA:ARBA00023027, ECO:0000256|RuleBase:RU003404}; +KW Respiratory chain {ECO:0000256|ARBA:ARBA00022660}; +KW Signal {ECO:0000256|SAM:SignalP}; +KW Translocase {ECO:0000256|ARBA:ARBA00022967}; +KW Transmembrane {ECO:0000256|ARBA:ARBA00022692, +KW ECO:0000256|RuleBase:RU003404}; +KW Transmembrane helix {ECO:0000256|ARBA:ARBA00022989, +KW ECO:0000256|RuleBase:RU003404}; +KW Transport {ECO:0000256|ARBA:ARBA00022448, ECO:0000256|RuleBase:RU003404}; +KW Ubiquinone {ECO:0000256|RuleBase:RU003404}. 
+SQ SEQUENCE 603 AA; 66955 MW; 897749F5B5EA5860 CRC64; + MTMHTTMTAL TLTSLIPPIL TTLVNPNKKN SYPHYVKSIV ASTFIISLFP TTMFMCLDQE + VIISNWHWAT TQTTQLSLSF KLDYFSMMFI PVALFVTWSI MEFSLWYMNS DPNINQFFKY + LLIFLITMLI LVTANNLFQL FIGWEGVGIM SFLLISWWYA RADANTAAIQ AILYNRIGDI + GFILALAWFI LHSNSWDPQQ MALLNANPSL TPLLGLLLAA AGKSAQLGLH PWLPSAMEGP + TPVSALLHSS TMVVAGIFLL IRFHPLAENS PLIQTLTLCL GAITTLFAAV CALTQNDIKK + IVAFSTSSQL GLMMVTIGIN QPHLAFLHIC THAFFKAMLF MCSGSIIHNL NNEQDIRKMG + GLLKTMPLTS TSLTIGSLAL AGMPFLTGFY SKDHIIETAN MSYTNAWALS ITLIATSLTS + AYSTRMILLT LTGQPRFPTL TNINENNPTL LNPIKRLAAG SLFAGFLITN SISPASPFQT + TVPLYLKLTA LAVTFLGLLT ALDLNYLTNK LKMKSPLCTF YFSNMLGFYP SITHRTIPYL + GLLTSQNLPL LLLDLTWLEK LLPKTISQHQ ISTSIITSTQ KGMIKLYFLS FFFPLILTLL + LIT +// +ID A0A1U9X8F2_HUMAN Unreviewed; 406 AA. +AC A0A1U9X8F2; +DT 07-JUN-2017, integrated into UniProtKB/TrEMBL. +DT 07-JUN-2017, sequence version 1. +DT 29-MAY-2024, entry version 45. +DE RecName: Full=RING-type E3 ubiquitin transferase {ECO:0000256|ARBA:ARBA00012483}; +DE EC=2.3.2.27 {ECO:0000256|ARBA:ARBA00012483}; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:AQY77073.1}; +DR EMBL; KY500599; AQY77071.1; -; Genomic_DNA. +DR EMBL; KY500600; AQY77072.1; -; Genomic_DNA. +DR EMBL; KY500601; AQY77073.1; -; Genomic_DNA. +DR RefSeq; NP_002922.2; NM_002931.3. +DR AlphaFoldDB; A0A1U9X8F2; -. +DR SMR; A0A1U9X8F2; -. +DR Antibodypedia; 1772; 641 antibodies from 36 providers. +DR DNASU; 6015; -. +DR GeneID; 6015; -. +DR KEGG; hsa:6015; -. +DR CTD; 6015; -. +DR DisGeNET; 6015; -. +DR VEuPathDB; HostDB:ENSG00000204227; -. +DR OMA; GAEDNCD; -. +DR OrthoDB; 460116at2759; -. +DR UniPathway; UPA00143; -. +DR ExpressionAtlas; A0A1U9X8F2; baseline and differential. +DR CDD; cd17166; RAWUL_RING1; 1. +DR CDD; cd16740; RING-HC_RING2; 1. +DR Gene3D; 3.30.40.10; Zinc/RING finger domain, C3HC4 (zinc finger); 1. +DR InterPro; IPR032443; RAWUL. +DR InterPro; IPR043540; RING1/RING2. +DR InterPro; IPR001841; Znf_RING. +DR InterPro; IPR013083; Znf_RING/FYVE/PHD. +DR InterPro; IPR017907; Znf_RING_CS. +DR PANTHER; PTHR46076:SF5; E3 UBIQUITIN-PROTEIN LIGASE RING1; 1. +DR PANTHER; PTHR46076; E3 UBIQUITIN-PROTEIN LIGASE RING1 / RING 2 FAMILY MEMBER; 1. +DR Pfam; PF16207; RAWUL; 1. +DR Pfam; PF13923; zf-C3HC4_2; 1. +DR SMART; SM00184; RING; 1. +DR SUPFAM; SSF57850; RING/U-box; 1. +DR PROSITE; PS00518; ZF_RING_1; 1. +DR PROSITE; PS50089; ZF_RING_2; 1. +PE 4: Predicted; +KW Metal-binding {ECO:0000256|ARBA:ARBA00022723}; +KW Nucleus {ECO:0000256|ARBA:ARBA00023242}; +KW Zinc {ECO:0000256|ARBA:ARBA00022833}; +KW Zinc-finger {ECO:0000256|ARBA:ARBA00022771, ECO:0000256|PROSITE- +KW ProRule:PRU00175}. +SQ SEQUENCE 406 AA; 42429 MW; 6959787479DE9DAB CRC64; + MTTPANAQNA SKTWELSLYE LHRTPQEAIM DGTEIAVSPR SLHSELMCPI CLDMLKNTMT + TKECLHRFCS DCIVTALRSG NKECPTCRKK LVSKRSLRPD PNFDALISKI YPSREEYEAH + QDRVLIRLSR LHNQQALSSS IEEGLRMQAM HRAQRVRRPI PGSDQTTTMS GGEGEPGEGE + GDGEDVSSDS APDSAPGPAP KRPRGGGAGG SSVGTGGGGT GGVGGGAGSE DSGDRGGTLG + GGTLGPPSPP GAPSPPEPGG EIELVFRPHP LLVEKGEYCQ TRYVKTTGNA TVDHLSKYLA + LRIALERRQQ QEAGEPGGPG GGASDTGGPD GCGGEGGGAG GGDGPEEPAL PSLEGVSEKQ + YTIYIAPGGG AFTTLNGSLT LELVNEKFWK VSRPLELCYA PTKDPK +// +ID Q4F4R7_HUMAN Unreviewed; 226 AA. +AC Q4F4R7; +DT 30-AUG-2005, integrated into UniProtKB/TrEMBL. +DT 30-AUG-2005, sequence version 1. +DT 29-MAY-2024, entry version 122. 
+DE RecName: Full=ATP synthase subunit a {ECO:0000256|ARBA:ARBA00021312, ECO:0000256|RuleBase:RU004450}; +GN Name=ATP6 {ECO:0000313|EMBL:AAZ00441.2}; +OS Homo sapiens (Human). +OG Mitochondrion {ECO:0000313|EMBL:AAZ00441.2}. +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:AAZ00441.2}; +DR EMBL; DQ112736; AAZ00441.2; -; Genomic_DNA. +DR EMBL; DQ305032; ABB99315.1; -; Genomic_DNA. +DR EMBL; DQ341077; ABC60775.1; -; Genomic_DNA. +DR EMBL; EU092680; ABU64269.1; -; Genomic_DNA. +DR EMBL; FJ625860; ACO92279.1; -; Genomic_DNA. +DR EMBL; GU455415; ADD20772.1; -; Genomic_DNA. +DR EMBL; JN655777; AEQ26443.1; -; Genomic_DNA. +DR EMBL; JQ044831; AEV47375.1; -; Genomic_DNA. +DR EMBL; JQ703621; AFF85036.1; -; Genomic_DNA. +DR EMBL; KC622073; AGJ95426.1; -; Genomic_DNA. +DR EMBL; KC911360; AGO91425.1; -; Genomic_DNA. +DR EMBL; KF011503; AGQ46203.1; -; Genomic_DNA. +DR EMBL; KF055329; AGS17760.1; -; Genomic_DNA. +DR EMBL; KF451436; AGZ72306.1; -; Genomic_DNA. +DR EMBL; KJ185406; AID07692.1; -; Genomic_DNA. +DR EMBL; KM101583; AIU57283.1; -; Genomic_DNA. +DR EMBL; KT819211; ALM03295.1; -; Genomic_DNA. +DR EMBL; MF055891; ASR95513.1; -; Genomic_DNA. +DR EMBL; MF621071; ATP04921.1; -; Genomic_DNA. +DR EMBL; KY797207; AUG82832.1; -; Genomic_DNA. +DR EMBL; MF381288; AUR39889.1; -; Genomic_DNA. +DR EMBL; MF696005; AUT79179.1; -; Genomic_DNA. +DR EMBL; MH981647; AYV90620.1; -; Genomic_DNA. +DR PeptideAtlas; Q4F4R7; -. +DR ChiTaRS; ATP6; human. +DR CDD; cd00310; ATP-synt_Fo_a_6; 1. +DR Gene3D; 1.20.120.220; ATP synthase, F0 complex, subunit A; 1. +DR InterPro; IPR000568; ATP_synth_F0_asu. +DR InterPro; IPR023011; ATP_synth_F0_asu_AS. +DR InterPro; IPR045083; ATP_synth_F0_asu_bact/mt. +DR InterPro; IPR035908; F0_ATP_A_sf. +DR NCBIfam; TIGR01131; ATP_synt_6_or_A; 1. +DR PANTHER; PTHR11410; ATP SYNTHASE SUBUNIT A; 1. +DR PANTHER; PTHR11410:SF0; ATP SYNTHASE SUBUNIT A; 1. +DR Pfam; PF00119; ATP-synt_A; 1. +DR PRINTS; PR00123; ATPASEA. +DR SUPFAM; SSF81336; F1F0 ATP synthase subunit A; 1. +DR PROSITE; PS00449; ATPASE_A; 1. +PE 3: Inferred from homology; +KW ATP synthesis {ECO:0000256|ARBA:ARBA00023310}; +KW CF(0) {ECO:0000256|ARBA:ARBA00022547}; +KW Hydrogen ion transport {ECO:0000256|ARBA:ARBA00022781}; +KW Ion transport {ECO:0000256|ARBA:ARBA00023065}; +KW Membrane {ECO:0000256|ARBA:ARBA00023136, ECO:0000256|SAM:Phobius}; +KW Mitochondrion {ECO:0000256|ARBA:ARBA00023128, ECO:0000313|EMBL:AAZ00441.2}; +KW Mitochondrion inner membrane {ECO:0000256|ARBA:ARBA00022792}; +KW Transmembrane {ECO:0000256|ARBA:ARBA00022692, ECO:0000256|SAM:Phobius}; +KW Transmembrane helix {ECO:0000256|ARBA:ARBA00022989, +KW ECO:0000256|SAM:Phobius}; Transport {ECO:0000256|ARBA:ARBA00022448}. +SQ SEQUENCE 226 AA; 24747 MW; ADC1F79724D46108 CRC64; + MNENLFASFI APTILGLPAA VLIILFPPLL IPTSKYLINN RLITTQQWLI KLTSKQMMAM + HNTKGRTWSL MLVSLIIFIA TTNLLGLLPH SFTPTTQLSM NLAMAIPLWA GAVIMGFRSK + IKNALAHFLP QGTPTSLIPM LVIIETISLL IQPMALAVRL TANITAGHLL MHLIGSATLA + MSTINLPSTL IIFTILILLT ILEIAVALIQ AYVFTLLVSL YLHDNT +// +ID A4ZMD8_HUMAN Unreviewed; 174 AA. +AC A4ZMD8; +DT 29-MAY-2007, integrated into UniProtKB/TrEMBL. +DT 29-MAY-2007, sequence version 1. +DT 29-MAY-2024, entry version 75. 
+DE RecName: Full=NADH-ubiquinone oxidoreductase chain 6 {ECO:0000256|ARBA:ARBA00021095, ECO:0000256|RuleBase:RU004430}; +DE EC=7.1.1.2 {ECO:0000256|ARBA:ARBA00012944, ECO:0000256|RuleBase:RU004430}; +GN Name=ND6 {ECO:0000313|EMBL:ABO39784.1}; +OS Homo sapiens (Human). +OG Mitochondrion {ECO:0000313|EMBL:ABO39784.1}. +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; +OC Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; +OC Homo. +OX NCBI_TaxID=9606 {ECO:0000313|EMBL:ABO39784.1}; +DR EMBL; EF184623; ABO39784.1; -; Genomic_DNA. +DR EMBL; GU296592; ADB05633.1; -; Genomic_DNA. +DR AlphaFoldDB; A4ZMD8; -. +DR PeptideAtlas; A4ZMD8; -. +DR ChiTaRS; MT-ND6; human. +DR InterPro; IPR001457; NADH_UbQ/plastoQ_OxRdtase_su6. +DR PANTHER; PTHR11435; NADH UBIQUINONE OXIDOREDUCTASE SUBUNIT ND6; 1. +DR PANTHER; PTHR11435:SF1; NADH-UBIQUINONE OXIDOREDUCTASE CHAIN 6; 1. +DR Pfam; PF00499; Oxidored_q3; 1. +PE 3: Inferred from homology; +KW Electron transport {ECO:0000256|RuleBase:RU004430}; +KW Membrane {ECO:0000256|RuleBase:RU004430}; +KW Mitochondrion {ECO:0000256|RuleBase:RU004430, ECO:0000313|EMBL:ABO39784.1}; +KW Mitochondrion inner membrane {ECO:0000256|ARBA:ARBA00022792}; +KW NAD {ECO:0000256|RuleBase:RU004430}; +KW Respiratory chain {ECO:0000256|RuleBase:RU004430}; +KW Signal {ECO:0000256|SAM:SignalP}; +KW Translocase {ECO:0000256|RuleBase:RU004430}; +KW Transmembrane {ECO:0000256|RuleBase:RU004430}; +KW Transmembrane helix {ECO:0000256|RuleBase:RU004430}; +KW Transport {ECO:0000256|ARBA:ARBA00022448, ECO:0000256|RuleBase:RU004430}; +KW Ubiquinone {ECO:0000256|RuleBase:RU004430}. +SQ SEQUENCE 174 AA; 18594 MW; E50CD9EB4DAC600D CRC64; + MMYALFLLSV GLVMGFVGFS SKPSPIYGGL VLIVSGVVGC VIILNFGGGY MGLMVFLIYL + GGMMVVFGYT TAMAIEEYPE AWGSGVEVLV SVLVGLAMEV GLVLWVKEYD GVVVVVNFNS + VGSWMIYEGE GSGLIREDPI GAGALYDYGR WLVVATGWTL FVGVYIVIEI ARGN +// diff --git a/src/python/test/xrefs/parsers/flatfiles/vgnc.txt b/src/python/test/xrefs/parsers/flatfiles/vgnc.txt new file mode 100644 index 000000000..b3b789376 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/vgnc.txt @@ -0,0 +1,11 @@ +taxon_id vgnc_id symbol name locus_group locus_type status location location_sortable: alias_symbol alias_name prev_symbol prev_name gene_family gene_family_id date_approved_reserved date_symbol_changed date_name_changed date_modified entrez_id ensembl_gene_id uniprot_ids +9796 VGNC:15375 AP1M1 adaptor related protein complex 1 subunit mu 1 protein-coding gene gene with protein product Approved 21 021 "adaptor related protein complex 1 mu 1 subunit" 2017-08-03 2018-04-25 2018-04-25 100069477 ENSECAG00000019502 "F7E1T6" +9796 VGNC:18055 FKBP8 FKBP prolyl isomerase 8 protein-coding gene gene with protein product Approved 21 021 "FK506 binding protein 8" 2017-08-03 2018-11-03 2018-11-03 100069569 ENSECAG00000012306 "F6RVH1" +9796 VGNC:18834 HOMER3 homer scaffold protein 3 protein-coding gene gene with protein product Approved 21 021 "homer scaffolding protein 3" 2017-08-03 2018-03-02 2018-03-02 100146769 ENSECAG00000023105 "F6VIN3" +9598 VGNC:14659 CYYR1 cysteine and tyrosine rich 1 protein-coding gene gene with protein product Approved 21 021 test_synonym 2017-01-05 2017-01-05 744581 ENSPTRG00000013812 "A0A2I3T968|H2QKV9" +9598 VGNC:3738 DIP2A disco interacting protein 2 homolog A protein-coding gene gene with protein product Approved 21 021 DIP2 2015-11-23 2015-11-23 2015-11-23 458619 ENSPTRG00000014025 "A0A2I3RVQ1|A0A2I3S1A1|K7CC60" +9598 VGNC:14660 DNAJC28 DnaJ heat shock 
protein family (Hsp40) member C28 protein-coding gene gene with protein product Approved 21 021 2017-01-05 2017-01-05 474095 ENSPTRG00000013870 "H2RAJ2" +9598 VGNC:14661 DNMT3L DNA methyltransferase 3 like protein-coding gene gene with protein product Approved 21 021 2017-01-05 2016-06-28 2017-01-05 470099 ENSPTRG00000013973 "H2QL42" +9598 VGNC:1158 DONSON downstream neighbor of SON protein-coding gene gene with protein product Approved 21 021 2015-11-10 2015-11-10 746652 ENSPTRG00000034273 "A0A2I3SLQ4|A0A2I3TCH4|G2HF01" +9598 VGNC:1152 DSCAM DS cell adhesion molecule protein-coding gene gene with protein product Approved 21 021 2015-11-10 2016-05-16 2016-05-16 747803 ENSPTRG00000013922 "A0A2I3RHK5|A0A2I3T5L1|H2QL16" +9796 VGNC:23003 MTREX Mtr4 exosome RNA helicase protein-coding gene gene with protein product Approved 21 021 "SKIV2L2" "Ski2 like RNA helicase 2" 2017-08-03 2017-11-26 2017-11-26 2017-11-26 100051945 ENSECAG00000023723 "F6YYD8" diff --git a/src/python/test/xrefs/parsers/flatfiles/xenopus_jamboree.txt b/src/python/test/xrefs/parsers/flatfiles/xenopus_jamboree.txt new file mode 100644 index 000000000..48452ff20 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/xenopus_jamboree.txt @@ -0,0 +1,12 @@ +XB-GENE-478054 trnt1 tRNA nucleotidyl transferase, CCA-adding, 1 ENSXETG00000025091 +XB-GENE-478064 foxh1.2 forkhead box H1, gene 2 ENSXETG00000005286 +XB-GENE-478074 nr5a2 nuclear receptor subfamily 5 group A member 2 ENSXETG00000000314 +XB-GENE-478084 tbx1 T-box 1 ENSXETG00000006304 +XB-GENE-478094 nr1d1 nuclear receptor subfamily 1 group D member 1 ENSXETG00000024397 +XB-GENE-478104 nucb1 nucleobindin 1 ENSXETG00000021229 +XB-GENE-478113 nsa2 NSA2, ribosome biogenesis homolog ENSXETG00000005077 +XB-GENE-478121 csnk1a1 casein kinase 1 alpha 1 ENSXETG00000020861 +XB-GENE-478131 hoxc6 homeobox C6 ENSXETG00000023479 +XB-GENE-478141 hba1 hemoglobin subunit alpha 1 ENSXETG00000025664 +XB-GENE-940866 rtp3c receptor (chemosensory) transporter protein 3 gene C [provisional] ENSXETG00000019753 +XB-GENE-981482 or1e2l conserved hypothetical olfactory receptor, 8 of 17 ENSXETG00000026609 diff --git a/src/python/test/xrefs/parsers/flatfiles/zfin/aliases.txt b/src/python/test/xrefs/parsers/flatfiles/zfin/aliases.txt new file mode 100644 index 000000000..3bc5d5b6b --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/zfin/aliases.txt @@ -0,0 +1,10 @@ +ZDB-GENE-000125-12 Df(Chr03)c1033 c1033 c1033 SO:1000029 +ZDB-GENE-000125-12 Df(Chr03)c1033 c1033 Df(LG03) SO:1000029 +ZDB-GENE-000125-12 Df(Chr03)c1033 c1033 Df(LG03)c1033 SO:1000029 +ZDB-ALT-000405-2 Df(Chr24:reck)w15 w15 Df(Chr24:reck)w15 SO:1000029 +ZDB-ALT-000405-2 Df(Chr24:reck)w15 w15 w15 SO:1000029 +ZDB-ALT-000712-2 Df(Chr9:epb41l5,ptpn4a,tmem177,pth2ra,hs6st3b,ramp1)b476 b476 b476 SO:1000029 +ZDB-ALT-000712-2 Df(Chr9:epb41l5,ptpn4a,tmem177,pth2ra,hs6st3b,ramp1)b476 b476 Df(Chr9:epb41l5,ptpn4a,tmem177,pth2ra)b476 SO:1000029 +ZDB-ALT-000712-2 Df(Chr9:epb41l5,ptpn4a,tmem177,pth2ra,hs6st3b,ramp1)b476 b476 moe SO:1000029 +ZDB-GENE-000128-18 zc1Tg zc1Tg Tg(NBT:MAPT-GFP) SO:0001218 +ZDB-GENE-000128-18 zc1Tg zc1Tg Tg(NBT:MAPT-GFP)zc1 SO:0001218 \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/flatfiles/zfin/ensembl_1_to_1.txt b/src/python/test/xrefs/parsers/flatfiles/zfin/ensembl_1_to_1.txt new file mode 100644 index 000000000..e7f89f7c5 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/zfin/ensembl_1_to_1.txt @@ -0,0 +1,10 @@ +ZDB-GENE-000112-47 SO:0001217 ppardb ENSDARG00000009473 
+ZDB-GENE-000125-12 SO:0001217 igfbp2a ENSDARG00000052470 +ZDB-GENE-000125-4 SO:0001217 dlc ENSDARG00000002336 +ZDB-GENE-000128-11 SO:0001217 dbx1b ENSDARG00000001859 +ZDB-GENE-000128-8 SO:0001217 dbx1a ENSDARG00000086393 +ZDB-GENE-000201-13 SO:0001217 anos1b ENSDARG00000004932 +ZDB-GENE-000201-18 SO:0001217 pbx4 ENSDARG00000052150 +ZDB-GENE-000201-9 SO:0001217 anos1a ENSDARG00000012896 +ZDB-GENE-000208-13 SO:0001217 crestin ENSDARG00000105570 +ZDB-GENE-000208-17 SO:0001217 calr3a ENSDARG00000103979 diff --git a/src/python/test/xrefs/parsers/flatfiles/zfin/refseq.txt b/src/python/test/xrefs/parsers/flatfiles/zfin/refseq.txt new file mode 100644 index 000000000..a6bda8e48 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/zfin/refseq.txt @@ -0,0 +1,10 @@ +ZDB-GENE-000112-47 SO:0001217 ppardb NP_571543 +ZDB-GENE-000112-47 SO:0001217 ppardb XP_005167044 +ZDB-GENE-000112-47 SO:0001217 ppardb XM_009303927 +ZDB-GENE-000112-47 SO:0001217 ppardb XM_005166987 +ZDB-GENE-000112-47 SO:0001217 ppardb XP_009302202 +ZDB-GENE-000112-47 SO:0001217 ppardb NM_131468 +ZDB-GENE-000112-47 SO:0001217 ppardb XP_009302203 +ZDB-GENE-000112-47 SO:0001217 ppardb XM_009303928 +ZDB-GENE-000201-96 SO:0001217 igfbp2a NP_571533 +ZDB-GENE-000201-96 SO:0001217 igfbp2a NM_131458 diff --git a/src/python/test/xrefs/parsers/flatfiles/zfin/uniprot.txt b/src/python/test/xrefs/parsers/flatfiles/zfin/uniprot.txt new file mode 100644 index 000000000..f41aba0ab --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/zfin/uniprot.txt @@ -0,0 +1,10 @@ +ZDB-GENE-000112-47 SO:0001217 ppardb A9C4A5 +ZDB-GENE-000125-12 SO:0001217 igfbp2a Q9PTH3 +ZDB-GENE-000125-404 SO:0001217 dlc A4JYS0 +ZDB-GENE-000125-4 SO:0001217 dlc Q9IAT6 +ZDB-GENE-000128-11 SO:0001217 dbx1b B3DG51 +ZDB-GENE-000128-11 SO:0001217 dbx1b Q9PTU0 +ZDB-GENE-000128-13 SO:0001217 dbx2 A0A8M9PP76 +ZDB-GENE-000128-18 SO:0001217 dbx1a B2GNV2 +ZDB-GENE-000128-18 SO:0001217 dbx1a Q9PTU1 +ZDB-GENE-000201-9 SO:0001217 anos1b Q1MT36 diff --git a/src/python/test/xrefs/parsers/flatfiles/zfin_desc.txt b/src/python/test/xrefs/parsers/flatfiles/zfin_desc.txt new file mode 100644 index 000000000..89dc75d45 --- /dev/null +++ b/src/python/test/xrefs/parsers/flatfiles/zfin_desc.txt @@ -0,0 +1,9 @@ +ZDB-GENE-030131-3003 HNF1 homeobox Bb hnf1bb 21 ZDB-REFCROSS-990707-1 +ZDB-GENE-030131-1077 hepatocyte nuclear factor 4, alpha hnf4a 23 ZDB-REFCROSS-000320-1 +ZDB-GENE-040718-488 WD repeat domain, phosphoinositide interacting 2 wipi2 0 +ZDB-GENE-070117-2473 wirbel wir 0 +ZDB-GENE-000710-5 WITHDRAWN:cripto WITHDRAWN:cripto 0 +ZDB-GENE-030516-5 WITHDRAWN:sb:cb476 WITHDRAWN:sb:cb476 0 +ZDB-GENE-030131-8698 WITHDRAWN:wu:fa94g04 WITHDRAWN:wu:fa94g04 0 +ZDB-GENE-070117-2162 lawrence welk wlk 0 +ZDB-GENE-040426-2161 wntless Wnt ligand secretion mediator wls 2 ZDB-REFCROSS-000320-1 diff --git a/src/python/test/xrefs/parsers/test_arrayexpress_parser.py b/src/python/test/xrefs/parsers/test_arrayexpress_parser.py new file mode 100644 index 000000000..db0379e08 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_arrayexpress_parser.py @@ -0,0 +1,110 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable +from types import SimpleNamespace + +from ensembl.production.xrefs.parsers.ArrayExpressParser import ArrayExpressParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link + +# Constants +SOURCE_ID_ARRAYEXPRESS = 1 +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" + +# Fixture to create an 
ArrayExpressParser instance +@pytest.fixture +def arrayexpress_parser() -> ArrayExpressParser: + return ArrayExpressParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(arrayexpress_parser: ArrayExpressParser, mock_xref_dbi: DBConnection, expected_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = arrayexpress_parser.run( + { + "source_id": SOURCE_ID_ARRAYEXPRESS, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing ArrayExpress data" + assert ( + f"Added {expected_xrefs} DIRECT xrefs" in result_message + ), f"{prefix}Expected 'Added {expected_xrefs} DIRECT xrefs' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id and species_id +def test_arrayexpress_no_source_id(arrayexpress_parser: ArrayExpressParser, test_no_source_id: Callable[[ArrayExpressParser, int], None]) -> None: + test_no_source_id(arrayexpress_parser, SPECIES_ID_HUMAN) + +def test_arrayexpress_no_species_id(arrayexpress_parser: ArrayExpressParser, test_no_species_id: Callable[[ArrayExpressParser, int], None]) -> None: + test_no_species_id(arrayexpress_parser, SOURCE_ID_ARRAYEXPRESS) + +# Test case to check if parsing is skipped when no species name can be found +def test_no_species_name(mock_xref_dbi: DBConnection, arrayexpress_parser: ArrayExpressParser) -> None: + result_code, result_message = arrayexpress_parser.run( + { + "source_id": SOURCE_ID_ARRAYEXPRESS, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"Errors when parsing ArrayExpress data" + assert ( + "Skipped. Could not find species ID to name mapping" in result_message + ), f"Expected 'Skipped. Could not find species ID to name mapping' in result_message, but got: '{result_message}'" + +# Test case to check if an error is raised when no ArrayExpress database is provided +def test_no_arrayexpress_db(arrayexpress_parser: ArrayExpressParser) -> None: + arrayexpress_parser.get_arrayexpress_db_url = MagicMock(return_value=None) + + with pytest.raises( + AttributeError, match="Could not find ArrayExpress DB. Missing or unsupported project value." 
+ ): + arrayexpress_parser.run( + { + "source_id": SOURCE_ID_ARRAYEXPRESS, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing of valid ArrayExpress data +def test_successful_parsing(mock_xref_dbi: DBConnection, arrayexpress_parser: ArrayExpressParser) -> None: + # Mock all needed methods + arrayexpress_parser.get_arrayexpress_db_url = MagicMock(return_value="mock_arrayexpress_db_url") + arrayexpress_data = [ + {"stable_id": "ENSG00000139618"}, + {"stable_id": "ENSG00000157764"}, + {"stable_id": "ENSG00000198786"}, + {"stable_id": "ENSG00000248378"}, + {"stable_id": "ENSG00000248379"}, + ] + arrayexpress_data_obj = [SimpleNamespace(**item) for item in arrayexpress_data] + arrayexpress_parser.get_arrayexpress_data = MagicMock(return_value=arrayexpress_data_obj) + + # Run and validate parsing for ArrayExpress data + run_and_validate_parsing(arrayexpress_parser, mock_xref_dbi, 5) + + # Check the row counts in the xref and gene_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DIRECT' AND source_id={SOURCE_ID_ARRAYEXPRESS}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 5) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "ENSG00000139618", "ENSG00000139618") + check_direct_xref_link(mock_xref_dbi, "gene", "ENSG00000157764", "ENSG00000157764") + check_direct_xref_link(mock_xref_dbi, "gene", "ENSG00000198786", "ENSG00000198786") + + # Run and validate re-parsing for ArrayExpress data + run_and_validate_parsing(arrayexpress_parser, mock_xref_dbi, 5, "Re-parsing: ") + + # Check the row counts in the xref and gene_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DIRECT' AND source_id={SOURCE_ID_ARRAYEXPRESS}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 5) \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_ccds_parser.py b/src/python/test/xrefs/parsers/test_ccds_parser.py new file mode 100644 index 000000000..1f7fe93e9 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_ccds_parser.py @@ -0,0 +1,91 @@ +import pytest +from unittest.mock import MagicMock, patch +from typing import Callable +from types import SimpleNamespace + +from ensembl.production.xrefs.parsers.CCDSParser import CCDSParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link + +# Constants +SOURCE_ID_CCDS = 1 +SPECIES_ID_HUMAN = 9606 + +# Fixture to create a CCDSParser instance +@pytest.fixture +def ccds_parser() -> CCDSParser: + return CCDSParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(ccds_parser: CCDSParser, mock_xref_dbi: DBConnection, expected_xrefs: int, expected_direct_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = ccds_parser.run( + { + "source_id": SOURCE_ID_CCDS, + "species_id": SPECIES_ID_HUMAN, + "dba": "mock_ccds_db_url", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing CCDS data" + assert ( + f"Parsed CCDS identifiers, added {expected_xrefs} xrefs and {expected_direct_xrefs} direct_xrefs" in result_message + ), f"{prefix}Expected 'Parsed CCDS identifiers, added {expected_xrefs} xrefs and {expected_direct_xrefs} direct_xrefs' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser 
arguments are passed: source_id and species_id +def test_ccds_no_source_id(ccds_parser: CCDSParser, test_no_source_id: Callable[[CCDSParser, int], None]) -> None: + test_no_source_id(ccds_parser, SPECIES_ID_HUMAN) + +def test_ccds_no_species_id(ccds_parser: CCDSParser, test_no_species_id: Callable[[CCDSParser, int], None]) -> None: + test_no_species_id(ccds_parser, SOURCE_ID_CCDS) + +# Test case to check if an error is returned when no CCDS database is provided +def test_no_ccds_db(ccds_parser: CCDSParser) -> None: + result_code, result_message = ccds_parser.run( + { + "source_id": SOURCE_ID_CCDS, + "species_id": SPECIES_ID_HUMAN, + "xref_dbi": MagicMock(), + } + ) + + assert result_code == 1, f"Errors when parsing CCDS data" + assert ( + "Could not find CCDS DB." in result_message + ), f"Expected 'Could not find CCDS DB.' in result_message, but got: '{result_message}'" + +# Test case to check successful parsing of valid CCDS data +def test_successful_parsing(mock_xref_dbi: DBConnection, ccds_parser: CCDSParser) -> None: + # Mock all needed methods + ccds_data = [ + {"stable_id": "CCDS2.2", "dbprimary_acc": "ENST00000342066"}, + {"stable_id": "CCDS3.1", "dbprimary_acc": "ENST00000327044"}, + {"stable_id": "CCDS4.1", "dbprimary_acc": "ENST00000379410"}, + {"stable_id": "CCDS5.1", "dbprimary_acc": "ENST00000379410"}, + {"stable_id": "CCDS7.2", "dbprimary_acc": "ENST00000421241"}, + {"stable_id": "CCDS7.2", "dbprimary_acc": "ENST00000379319"}, + ] + ccds_data_obj = [SimpleNamespace(**item) for item in ccds_data] + ccds_parser.get_ccds_data = MagicMock(return_value=ccds_data_obj) + + # Run and validate parsing for CCDS data + run_and_validate_parsing(ccds_parser, mock_xref_dbi, 5, 6) + + # Check the row counts in the xref and transcript_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DIRECT' AND source_id={SOURCE_ID_CCDS}") + check_row_count(mock_xref_dbi, "transcript_direct_xref", 6) + + # Check the link between an xref and transcript_direct_xref + check_direct_xref_link(mock_xref_dbi, "transcript", "ENST00000327044", "CCDS3.1") + check_direct_xref_link(mock_xref_dbi, "transcript", "ENST00000421241", "CCDS7.2") + check_direct_xref_link(mock_xref_dbi, "transcript", "ENST00000379319", "CCDS7.2") + + # Run and validate re-parsing for CCDS data + run_and_validate_parsing(ccds_parser, mock_xref_dbi, 5, 6, "Re-parsing: ") + + # Check the row counts in the xref and transcript_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DIRECT' AND source_id={SOURCE_ID_CCDS}") + check_row_count(mock_xref_dbi, "transcript_direct_xref", 6) \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_dbass_parser.py b/src/python/test/xrefs/parsers/test_dbass_parser.py new file mode 100644 index 000000000..c6ec23967 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_dbass_parser.py @@ -0,0 +1,147 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.DBASSParser import DBASSParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_synonym, check_direct_xref_link + +# Constants +SOURCE_ID_DBASS3 = 1 +SOURCE_ID_DBASS5 = 2 +SPECIES_ID_HUMAN = 9606 +EXPECTED_NUMBER_OF_COLUMNS = 23 + +# Fixture to create a DBASSParser instance +@pytest.fixture +def dbass_parser() -> DBASSParser: + return DBASSParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(dbass_parser: DBASSParser,
mock_xref_dbi: DBConnection, source_id: int, file: str, expected_direct_xrefs: int, expected_skipped_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = dbass_parser.run( + { + "source_id": source_id, + "species_id": SPECIES_ID_HUMAN, + "file": f"parsers/flatfiles/{file}.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing {file.upper()} data" + assert ( + f"{expected_direct_xrefs} direct xrefs successfully processed" in result_message + ), f"{prefix}Expected '{expected_direct_xrefs} direct xrefs successfully processed' in result_message, but got: '{result_message}'" + assert ( + f"Skipped {expected_skipped_xrefs} unmapped xrefs" in result_message + ), f"{prefix}Expected 'Skipped {expected_skipped_xrefs} unmapped xrefs' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_dbass_no_source_id(dbass_parser: DBASSParser, test_no_source_id: Callable[[DBASSParser, int], None]) -> None: + test_no_source_id(dbass_parser, SPECIES_ID_HUMAN) + +def test_dbass_no_species_id(dbass_parser: DBASSParser, test_no_species_id: Callable[[DBASSParser, int], None]) -> None: + test_no_species_id(dbass_parser, SOURCE_ID_DBASS3) + +def test_dbass_no_file(dbass_parser: DBASSParser, test_no_file: Callable[[DBASSParser, int, int], None]) -> None: + test_no_file(dbass_parser, SOURCE_ID_DBASS3, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_dbass_file_not_found(dbass_parser: DBASSParser, test_file_not_found: Callable[[DBASSParser, int, int], None]) -> None: + test_file_not_found(dbass_parser, SOURCE_ID_DBASS3, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is empty +def test_dbass_empty_file(dbass_parser: DBASSParser, test_empty_file: Callable[[DBASSParser, str, int, int], None]) -> None: + test_empty_file(dbass_parser, 'DBASS', SOURCE_ID_DBASS3, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the header has insufficient columns +def test_insufficient_header_columns(dbass_parser: DBASSParser) -> None: + mock_file = io.StringIO("Id,GeneSymbol,GeneFullName,EnsemblReference\n") + dbass_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in DBASS file"): + dbass_parser.run( + { + "source_id": SOURCE_ID_DBASS3, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Parametrized test case to check if an error is raised for various malformed headers +@pytest.mark.parametrize( + "header", [ + ("GeneId,GeneSymbol,GeneFullName,EnsemblReference,Phenotype,OmimReference,Mutation,Location,AuthenticAberrantDistance,ReadingFrameChange,NucleotideSequence,InTerminalExon,Comment,MutationCoordinates,AberrantSpliceSiteCoordinates,MaximumEntropyModelAuthentic,MaximumEntropyModelCryptic,FirstOrderMarkovModelAuthentic,FirstOrderMarkovModelCryptic,WeightMatrixModelAuthentic,WeightMatrixModelCryptic,PubMedReference,ReferenceText\n"), + 
("Id,GeneSymbols,GeneFullName,EnsemblReference,Phenotype,OmimReference,Mutation,Location,AuthenticAberrantDistance,ReadingFrameChange,NucleotideSequence,InTerminalExon,Comment,MutationCoordinates,AberrantSpliceSiteCoordinates,MaximumEntropyModelAuthentic,MaximumEntropyModelCryptic,FirstOrderMarkovModelAuthentic,FirstOrderMarkovModelCryptic,WeightMatrixModelAuthentic,WeightMatrixModelCryptic,PubMedReference,ReferenceText\n"), + ("Id,GeneSymbol,GeneFullName,EnsemblRef,Phenotype,OmimReference,Mutation,Location,AuthenticAberrantDistance,ReadingFrameChange,NucleotideSequence,InTerminalExon,Comment,MutationCoordinates,AberrantSpliceSiteCoordinates,MaximumEntropyModelAuthentic,MaximumEntropyModelCryptic,FirstOrderMarkovModelAuthentic,FirstOrderMarkovModelCryptic,WeightMatrixModelAuthentic,WeightMatrixModelCryptic,PubMedReference,ReferenceText\n"), + ], + ids=["first column", "second column", "fourth column"], +) +def test_malformed_headers(dbass_parser: DBASSParser, header: str) -> None: + mock_file = io.StringIO(header) + dbass_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in DBASS file"): + dbass_parser.run( + { + "source_id": SOURCE_ID_DBASS3, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the file has insufficient columns +def test_insufficient_columns(dbass_parser: DBASSParser) -> None: + mock_file = io.StringIO() + mock_file.write( + "Id,GeneSymbol,GeneFullName,EnsemblReference,Phenotype,OmimReference,Mutation,Location,AuthenticAberrantDistance,ReadingFrameChange,NucleotideSequence,InTerminalExon,Comment,MutationCoordinates,AberrantSpliceSiteCoordinates,MaximumEntropyModelAuthentic,MaximumEntropyModelCryptic,FirstOrderMarkovModelAuthentic,FirstOrderMarkovModelCryptic,WeightMatrixModelAuthentic,WeightMatrixModelCryptic,PubMedReference,ReferenceText\n" + ) + mock_file.write("1,GNAS complex locus,ENSG00000087460,Hereditary osteodystrophy,103580\n") + mock_file.seek(0) + + dbass_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="has an incorrect number of columns"): + dbass_parser.run( + { + "source_id": SOURCE_ID_DBASS3, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing +def test_successful_parsing(mock_xref_dbi: DBConnection, dbass_parser: DBASSParser) -> None: + # Run and validate parsing for DBASS3 and DBASS5 files + run_and_validate_parsing(dbass_parser, mock_xref_dbi, SOURCE_ID_DBASS3, "dbass3", 6, 1) + run_and_validate_parsing(dbass_parser, mock_xref_dbi, SOURCE_ID_DBASS5, "dbass5", 6, 0) + + # Check the row counts in the xref, gene_direct_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_DBASS3}") + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_DBASS5}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 12) + check_row_count(mock_xref_dbi, "synonym", 3) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "2", "ENSG00000130164") + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "2", SOURCE_ID_DBASS3, "LDLT") + check_synonym(mock_xref_dbi, "3", SOURCE_ID_DBASS3, "LDLT") + check_synonym(mock_xref_dbi, "4", SOURCE_ID_DBASS3, "LDLT") + + # Run and validate re-parsing for DBASS3 file 
+ run_and_validate_parsing(dbass_parser, mock_xref_dbi, SOURCE_ID_DBASS3, "dbass3", 6, 1, "Re-parsing: ") + + # Check the row counts in the xref, gene_direct_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_DBASS3}") + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_DBASS5}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 12) + check_row_count(mock_xref_dbi, "synonym", 3) diff --git a/src/python/test/xrefs/parsers/test_entrezgene_parser.py b/src/python/test/xrefs/parsers/test_entrezgene_parser.py new file mode 100644 index 000000000..f0d31f8ae --- /dev/null +++ b/src/python/test/xrefs/parsers/test_entrezgene_parser.py @@ -0,0 +1,157 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.EntrezGeneParser import EntrezGeneParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_synonym + +# Constants +SOURCE_ID_ENTREZGENE = 1 +SOURCE_ID_WIKIGENE = 2 +SPECIES_ID_HUMAN = 9606 +EXPECTED_NUMBER_OF_COLUMNS = 16 + +# Fixture to create an EntrezGeneParser instance +@pytest.fixture +def entrezgene_parser() -> EntrezGeneParser: + return EntrezGeneParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(entrezgene_parser: EntrezGeneParser, mock_xref_dbi: DBConnection, expected_entrez_xrefs: int, expected_wiki_xrefs: int, expected_synonyms: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = entrezgene_parser.run( + { + "source_id": SOURCE_ID_ENTREZGENE, + "species_id": SPECIES_ID_HUMAN, + "file": "parsers/flatfiles/entrezgene.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing EntrezGene data" + assert ( + f"{expected_entrez_xrefs} EntrezGene Xrefs and {expected_wiki_xrefs} WikiGene Xrefs added with {expected_synonyms} synonyms" in result_message + ), f"{prefix}Expected '{expected_entrez_xrefs} EntrezGene Xrefs and {expected_wiki_xrefs} WikiGene Xrefs added with {expected_synonyms} synonyms' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_entrezgene_no_source_id(entrezgene_parser: EntrezGeneParser, test_no_source_id: Callable[[EntrezGeneParser, int], None]) -> None: + test_no_source_id(entrezgene_parser, SPECIES_ID_HUMAN) + +def test_entrezgene_no_species_id(entrezgene_parser: EntrezGeneParser, test_no_species_id: Callable[[EntrezGeneParser, int], None]) -> None: + test_no_species_id(entrezgene_parser, SOURCE_ID_ENTREZGENE) + +def test_entrezgene_no_file(entrezgene_parser: EntrezGeneParser, test_no_file: Callable[[EntrezGeneParser, int, int], None]) -> None: + test_no_file(entrezgene_parser, SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_entrezgene_file_not_found(entrezgene_parser: EntrezGeneParser, test_file_not_found: Callable[[EntrezGeneParser, int, int], None]) -> None: + test_file_not_found(entrezgene_parser, SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is empty +def test_entrezgene_empty_file(entrezgene_parser: EntrezGeneParser, test_empty_file: Callable[[EntrezGeneParser, str, int, int], None]) -> None: + test_empty_file(entrezgene_parser, 'EntrezGene', SOURCE_ID_ENTREZGENE, 
SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the header has insufficient columns +def test_insufficient_header_columns(entrezgene_parser: EntrezGeneParser) -> None: + mock_file = io.StringIO("#tax_id\tgeneid\tsymbol\n") + entrezgene_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in EntrezGene file"): + entrezgene_parser.run( + { + "source_id": SOURCE_ID_ENTREZGENE, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Parametrized test case to check if an error is raised for various malformed headers +@pytest.mark.parametrize( + "header", [ + ("tax_ids\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneIDs\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbols\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocuTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSyn\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdb_Xrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchr\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmapp_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription:\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype__of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + 
("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomen_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tstatus\tOther_designations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tdesignations\tModification_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tMod_date\tFeature_type\n"), + ("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeaturetype\n"), + ], + ids=[ + "tax_id column", "gene_id column", "symbol column", "locus_tag column", "synonyms column", + "db_xrefs column", "chromosome column", "map_location column", "description column", + "type_of_gene column", "symbol_nomen_auth column", "full_name column", "nomen_status column", + "other_designations column", "mofification_date column", "feature_type column" + ], +) +def test_malformed_headers(entrezgene_parser: EntrezGeneParser, header: str) -> None: + mock_file = io.StringIO(header) + entrezgene_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in EntrezGene file"): + entrezgene_parser.run( + { + "source_id": SOURCE_ID_ENTREZGENE, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the file has insufficient columns +def test_insufficient_columns(entrezgene_parser: EntrezGeneParser) -> None: + mock_file = io.StringIO() + mock_file.write("#tax_id\tGeneID\tSymbol\tLocusTag\tSynonyms\tdbXrefs\tchromosome\tmap_location\tdescription\ttype_of_gene\tSymbol_from_nomenclature_authority\tFull_name_from_nomenclature_authority\tNomenclature_status\tOther_designations\tModification_date\tFeature_type\n") + mock_file.write("9606\t1\tA1BG\t-\tA1B|ABG|GAB|HYST2477\n") + mock_file.seek(0) + + entrezgene_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="has an incorrect number of columns"): + entrezgene_parser.run( + { + "source_id": SOURCE_ID_ENTREZGENE, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing of valid EntrezGene data +def test_successful_parsing(mock_xref_dbi: DBConnection, entrezgene_parser: EntrezGeneParser) -> None: + entrezgene_parser.get_source_id_for_source_name = MagicMock(return_value=SOURCE_ID_WIKIGENE) + + # Run and validate parsing for EntrezGene file + 
run_and_validate_parsing(entrezgene_parser, mock_xref_dbi, 10, 10, 26) + + # Check the row counts in the xref and synonym tables + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_WIKIGENE}") + check_row_count(mock_xref_dbi, "synonym", 26) + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "2", SOURCE_ID_ENTREZGENE, "A2MD") + check_synonym(mock_xref_dbi, "2", SOURCE_ID_ENTREZGENE, "CPAMD5") + check_synonym(mock_xref_dbi, "2", SOURCE_ID_ENTREZGENE, "FWP007") + check_synonym(mock_xref_dbi, "2", SOURCE_ID_ENTREZGENE, "S863-7") + + # Run and validate re-parsing for EntrezGene file + run_and_validate_parsing(entrezgene_parser, mock_xref_dbi, 10, 10, 26, "Re-parsing: ") + + # Check the row counts in the xref and synonym tables + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_WIKIGENE}") + check_row_count(mock_xref_dbi, "synonym", 26) diff --git a/src/python/test/xrefs/parsers/test_hgnc_parser.py b/src/python/test/xrefs/parsers/test_hgnc_parser.py new file mode 100644 index 000000000..7f920779f --- /dev/null +++ b/src/python/test/xrefs/parsers/test_hgnc_parser.py @@ -0,0 +1,182 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable, Dict + +from ensembl.production.xrefs.parsers.HGNCParser import HGNCParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link, check_dependent_xref_link, check_synonym + +# Constants +SOURCE_ID_HGNC = 1 +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" +SOURCE_ID_CCDS = 2 +SOURCE_ID_ENTREZGENE = 3 +SOURCE_ID_REFSEQ = 4 +SOURCE_ID_ENSEMBL_MANUAL = 5 +SOURCE_ID_LRG = 6 +SOURCE_ID_GENECARDS = 7 +SOURCE_ID_DESC_ONLY = 8 + +# Fixture to create an HGNCParser instance +@pytest.fixture +def hgnc_parser() -> HGNCParser: + return HGNCParser(True) + +# Mock for get_source_id_for_source_name +def mock_get_source_id_for_source_name(source_name: str, mock_xref_dbi: DBConnection, desc: str = None) -> int: + source_mapping = { + "ccds": SOURCE_ID_CCDS, + "entrezgene_manual": SOURCE_ID_ENTREZGENE, + "refseq_manual": SOURCE_ID_REFSEQ, + "ensembl_manual": SOURCE_ID_ENSEMBL_MANUAL, + "lrg_hgnc_notransfer": SOURCE_ID_LRG, + "genecards": SOURCE_ID_GENECARDS, + "desc_only": SOURCE_ID_DESC_ONLY, + } + + if source_name == "HGNC" and desc: + return source_mapping.get(desc, SOURCE_ID_HGNC) + + return source_mapping.get(source_name.lower(), SOURCE_ID_HGNC) + +# Function to run and validate the parsing process +def run_and_validate_parsing(hgnc_parser: HGNCParser, mock_xref_dbi: DBConnection, expected_xrefs: Dict[str, int], expected_mismatch: int, expected_synonyms: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = hgnc_parser.run( + { + "source_id": SOURCE_ID_HGNC, + "species_id": SPECIES_ID_HUMAN, + "file": "parsers/flatfiles/hgnc.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing HGNC data" + for count_type, count in expected_xrefs.items(): + assert f"{count_type}\t{count}" in result_message, f"{prefix}Expected '{count_type}\t{count}' in result_message, but got: '{result_message}'" + + assert ( + f"{expected_synonyms} synonyms added" in result_message + ), 
f"{prefix}Expected '{expected_synonyms} synonyms added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_mismatch} HGNC ids could not be associated in xrefs" in result_message + ), f"{prefix}Expected '{expected_mismatch} HGNC ids could not be associated in xrefs' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_hgnc_no_source_id(hgnc_parser: HGNCParser, test_no_source_id: Callable[[HGNCParser, int], None]) -> None: + test_no_source_id(hgnc_parser, SPECIES_ID_HUMAN) + +def test_hgnc_no_species_id(hgnc_parser: HGNCParser, test_no_species_id: Callable[[HGNCParser, int], None]) -> None: + test_no_species_id(hgnc_parser, SOURCE_ID_HGNC) + +def test_hgnc_no_file(hgnc_parser: HGNCParser, test_no_file: Callable[[HGNCParser, int, int], None]) -> None: + test_no_file(hgnc_parser, SOURCE_ID_HGNC, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when no CCDS database is provided +def test_no_ccds_db(hgnc_parser: HGNCParser) -> None: + with pytest.raises( + AttributeError, match="No ensembl ccds database provided" + ): + hgnc_parser.run( + { + "source_id": SOURCE_ID_HGNC, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the file is not found +def test_hgnc_file_not_found(hgnc_parser: HGNCParser, test_file_not_found: Callable[[HGNCParser, int, int], None]) -> None: + hgnc_parser.construct_db_url = MagicMock(return_value="dummy_db_url") + test_file_not_found(hgnc_parser, SOURCE_ID_HGNC, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is empty +def test_hgnc_empty_file(hgnc_parser: HGNCParser, test_empty_file: Callable[[HGNCParser, str, int, int], None]) -> None: + hgnc_parser.construct_db_url = MagicMock(return_value="dummy_db_url") + test_empty_file(hgnc_parser, 'HGNC', SOURCE_ID_HGNC, SPECIES_ID_HUMAN) + +# Test case to check successful parsing of valid HGNC data without existing ccds, refseq, or entrezgene xrefs +def test_successful_parsing_without_existing_xrefs(mock_xref_dbi: DBConnection, hgnc_parser: HGNCParser) -> None: + # Mock all needed methods + hgnc_parser.get_source_name_for_source_id = MagicMock(return_value="HGNC") + hgnc_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + hgnc_parser.construct_db_url = MagicMock(return_value="dummy_db_url") + hgnc_parser.get_ccds_to_ens_mapping = MagicMock(return_value={}) + hgnc_parser.get_valid_codes = MagicMock(return_value={}) + hgnc_parser.get_valid_xrefs_for_dependencies = MagicMock(return_value={}) + + # Run and validate parsing for HGNC file + expected_counts = {"ccds": 0, "entrezgene_manual": 0, "refseq_manual": 0, "ensembl_manual": 19, "lrg": 2, "genecards": 19} + run_and_validate_parsing(hgnc_parser, mock_xref_dbi, expected_counts, 1, 78) + + # Check the row counts in the xref, gene_direct_xref, dependent_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 19, f"info_type='DIRECT' AND source_id={SOURCE_ID_ENSEMBL_MANUAL}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DIRECT' AND source_id={SOURCE_ID_LRG}") + check_row_count(mock_xref_dbi, "xref", 19, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_GENECARDS}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='MISC' AND source_id={SOURCE_ID_DESC_ONLY}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 21) + check_row_count(mock_xref_dbi, 
"dependent_xref", 19) + check_row_count(mock_xref_dbi, "synonym", 78) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "HGNC:5", "ENSG00000121410") + +# Test case to check successful parsing of valid HGNC data with existing ccds, refseq, and entrezgene xrefs +def test_successful_parsing_with_existing_xrefs(mock_xref_dbi: DBConnection, hgnc_parser: HGNCParser) -> None: + # Mock all needed methods + hgnc_parser.get_source_name_for_source_id = MagicMock(return_value="HGNC") + hgnc_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + hgnc_parser.construct_db_url = MagicMock(return_value="dummy_db_url") + hgnc_parser.get_ccds_to_ens_mapping = MagicMock(return_value={"CCDS12976": "CCDS12976", "CCDS8856": "CCDS8856", "CCDS53797": "CCDS53797"}) + hgnc_parser.get_valid_codes = MagicMock(return_value={"NM_130786": [12], "NR_026971": [34, 56], "NR_015380": [78], "NM_001088": [90]}) + hgnc_parser.get_valid_xrefs_for_dependencies = MagicMock(return_value={"503538": 123, "441376": 456, "51146": 789}) + + # Run and validate parsing for HGNC file + expected_counts = {"ccds": 3, "entrezgene_manual": 3, "refseq_manual": 5, "ensembl_manual": 19, "lrg": 2, "genecards": 19} + run_and_validate_parsing(hgnc_parser, mock_xref_dbi, expected_counts, 1, 90) + + # Check the row counts in the xref, gene_direct_xref, dependent_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DIRECT' AND source_id={SOURCE_ID_CCDS}") + check_row_count(mock_xref_dbi, "xref", 19, f"info_type='DIRECT' AND source_id={SOURCE_ID_ENSEMBL_MANUAL}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DIRECT' AND source_id={SOURCE_ID_LRG}") + check_row_count(mock_xref_dbi, "xref", 19, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_GENECARDS}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_REFSEQ}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='MISC' AND source_id={SOURCE_ID_DESC_ONLY}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 24) + check_row_count(mock_xref_dbi, "dependent_xref", 27) + check_row_count(mock_xref_dbi, "synonym", 90) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "HGNC:13666", "CCDS8856") + check_direct_xref_link(mock_xref_dbi, "gene", "HGNC:20", "LRG_359") + + # Check the link between an xref and dependent_xref + check_dependent_xref_link(mock_xref_dbi, "HGNC:5", 12) + check_dependent_xref_link(mock_xref_dbi, "HGNC:27057", 56) + check_dependent_xref_link(mock_xref_dbi, "HGNC:17968", 789) + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "HGNC:8", SOURCE_ID_ENSEMBL_MANUAL, "A2MP") + check_synonym(mock_xref_dbi, "HGNC:37133", SOURCE_ID_ENTREZGENE, "FLJ23569") + check_synonym(mock_xref_dbi, "HGNC:37133", SOURCE_ID_REFSEQ, "FLJ23569") + + # Run and validate re-parsing for HGNC file + run_and_validate_parsing(hgnc_parser, mock_xref_dbi, expected_counts, 1, 90, "Re-parsing: ") + + # Check the row counts in the xref, gene_direct_xref, dependent_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DIRECT' AND source_id={SOURCE_ID_CCDS}") + check_row_count(mock_xref_dbi, "xref", 19, f"info_type='DIRECT' AND source_id={SOURCE_ID_ENSEMBL_MANUAL}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DIRECT' 
AND source_id={SOURCE_ID_LRG}") + check_row_count(mock_xref_dbi, "xref", 19, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_GENECARDS}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_REFSEQ}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='MISC' AND source_id={SOURCE_ID_DESC_ONLY}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 24) + check_row_count(mock_xref_dbi, "dependent_xref", 27) + check_row_count(mock_xref_dbi, "synonym", 90) + diff --git a/src/python/test/xrefs/parsers/test_hpa_parser.py b/src/python/test/xrefs/parsers/test_hpa_parser.py new file mode 100644 index 000000000..838a3756e --- /dev/null +++ b/src/python/test/xrefs/parsers/test_hpa_parser.py @@ -0,0 +1,132 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.HPAParser import HPAParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link + +# Constants +SOURCE_ID_HPA = 1 +SPECIES_ID_HUMAN = 9606 +EXPECTED_NUMBER_OF_COLUMNS = 4 + +# Fixture to create an HPAParser instance +@pytest.fixture +def hpa_parser() -> HPAParser: + return HPAParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(hpa_parser: HPAParser, mock_xref_dbi: DBConnection, expected_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = hpa_parser.run( + { + "source_id": SOURCE_ID_HPA, + "species_id": SPECIES_ID_HUMAN, + "file": "parsers/flatfiles/hpa.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing HPA data" + assert ( + f"{expected_xrefs} direct xrefs successfully parsed" in result_message + ), f"{prefix}Expected '{expected_xrefs} direct xrefs successfully parsed' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_hpa_no_source_id(hpa_parser: HPAParser, test_no_source_id: Callable[[HPAParser, int], None]) -> None: + test_no_source_id(hpa_parser, SPECIES_ID_HUMAN) + +def test_hpa_no_species_id(hpa_parser: HPAParser, test_no_species_id: Callable[[HPAParser, int], None]) -> None: + test_no_species_id(hpa_parser, SOURCE_ID_HPA) + +def test_hpa_no_file(hpa_parser: HPAParser, test_no_file: Callable[[HPAParser, int, int], None]) -> None: + test_no_file(hpa_parser, SOURCE_ID_HPA, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_hpa_file_not_found(hpa_parser: HPAParser, test_file_not_found: Callable[[HPAParser, int, int], None]) -> None: + test_file_not_found(hpa_parser, SOURCE_ID_HPA, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is empty +def test_hpa_empty_file(hpa_parser: HPAParser, test_empty_file: Callable[[HPAParser, str, int, int], None]) -> None: + test_empty_file(hpa_parser, 'HPA', SOURCE_ID_HPA, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the header has insufficient columns +def test_insufficient_header_columns(hpa_parser: HPAParser) -> None: + mock_file = io.StringIO("antibody,antibody_id\n") + hpa_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in HPA file"): + hpa_parser.run( + { + "source_id": 
SOURCE_ID_HPA, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Parametrized test case to check if an error is raised for various malformed headers +@pytest.mark.parametrize( + "header", [ + ("Antibodies,antibody_id,ensembl_peptide_id,link\n"), + ("Antibody,antibodyId,ensembl_peptide_id,link\n"), + ("Antibody,antibody_id,ensembl peptide id,link\n"), + ("Antibody,antibody_id,ensembl_peptide_id,links\n") + ], + ids=["antibody column", "antibody_id column", "ensembl_id column", "link column"], +) +def test_malformed_headers(hpa_parser: HPAParser, header: str) -> None: + mock_file = io.StringIO(header) + hpa_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in HPA file"): + hpa_parser.run( + { + "source_id": SOURCE_ID_HPA, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the file has insufficient columns +def test_insufficient_columns(hpa_parser: HPAParser) -> None: + mock_file = io.StringIO() + mock_file.write("Antibody,antibody_id,ensembl_peptide_id,link\n") + mock_file.write("CAB000001,1,ENSP00000363822\n") + mock_file.seek(0) + + hpa_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="has an incorrect number of columns"): + hpa_parser.run( + { + "source_id": SOURCE_ID_HPA, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing of valid HPA data +def test_successful_parsing(mock_xref_dbi: DBConnection, hpa_parser: HPAParser) -> None: + # Run and validate parsing for HPA file + run_and_validate_parsing(hpa_parser, mock_xref_dbi, 10) + + # Check the row counts in the xref and direct_xref tables + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DIRECT' AND source_id={SOURCE_ID_HPA}") + check_row_count(mock_xref_dbi, "translation_direct_xref", 10) + + # Check the link between an xref and translation_direct_xref + check_direct_xref_link(mock_xref_dbi, "translation", "2", "ENSP00000224784") + + # Run and validate re-parsing of the HPA file + run_and_validate_parsing(hpa_parser, mock_xref_dbi, 10, "Re-parsing: ") + + # Re-check the row counts in the xref and direct_xref tables after re-parsing + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DIRECT' AND source_id={SOURCE_ID_HPA}") + check_row_count(mock_xref_dbi, "translation_direct_xref", 10) \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_jgi_protein_parser.py b/src/python/test/xrefs/parsers/test_jgi_protein_parser.py new file mode 100644 index 000000000..666e6fa95 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_jgi_protein_parser.py @@ -0,0 +1,61 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.JGI_ProteinParser import JGI_ProteinParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count + +# Constants +SOURCE_ID_JGI = 1 +SPECIES_ID_C_INTESTINALIS = 7719 + +# Fixture to create a JGI_ProteinParser instance +@pytest.fixture +def jgi_protein_parser() -> JGI_ProteinParser: + return JGI_ProteinParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(jgi_protein_parser: JGI_ProteinParser, mock_xref_dbi: DBConnection, expected_xrefs: int, prefix: str = None) -> None: + if prefix 
is None: + prefix = "" + + result_code, result_message = jgi_protein_parser.run( + { + "source_id": SOURCE_ID_JGI, + "species_id": SPECIES_ID_C_INTESTINALIS, + "file": "parsers/flatfiles/jgi_protein.fasta", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing JGI data" + assert f"{expected_xrefs} JGI_ xrefs successfully parsed" in result_message, f"{prefix}Expected '{expected_xrefs} JGI_ xrefs successfully parsed' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_jgi_no_source_id(jgi_protein_parser: JGI_ProteinParser, test_no_source_id: Callable[[JGI_ProteinParser, int], None]) -> None: + test_no_source_id(jgi_protein_parser, SPECIES_ID_C_INTESTINALIS) + +def test_jgi_no_species_id(jgi_protein_parser: JGI_ProteinParser, test_no_species_id: Callable[[JGI_ProteinParser, int], None]) -> None: + test_no_species_id(jgi_protein_parser, SOURCE_ID_JGI) + +def test_jgi_no_file(jgi_protein_parser: JGI_ProteinParser, test_no_file: Callable[[JGI_ProteinParser, int, int], None]) -> None: + test_no_file(jgi_protein_parser, SOURCE_ID_JGI, SPECIES_ID_C_INTESTINALIS) + +# Test case to check if an error is raised when the file is not found +def test_jgi_file_not_found(jgi_protein_parser: JGI_ProteinParser, test_file_not_found: Callable[[JGI_ProteinParser, int, int], None]) -> None: + test_file_not_found(jgi_protein_parser, SOURCE_ID_JGI, SPECIES_ID_C_INTESTINALIS) + +# Test case to check if an error is raised when the file is empty +def test_jgi_empty_file(jgi_protein_parser: JGI_ProteinParser, test_empty_file: Callable[[JGI_ProteinParser, str, int, int], None]) -> None: + test_empty_file(jgi_protein_parser, 'JGIProtein', SOURCE_ID_JGI, SPECIES_ID_C_INTESTINALIS) + +# Test case to check successful parsing +def test_successful_parsing(mock_xref_dbi: DBConnection, jgi_protein_parser: JGI_ProteinParser) -> None: + # Run and validate parsing for JGI Protein file + run_and_validate_parsing(jgi_protein_parser, mock_xref_dbi, 9) + + # Check the row counts in the xref and primary_xref tables + check_row_count(mock_xref_dbi, "xref", 9, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_JGI}") + check_row_count(mock_xref_dbi, "primary_xref", 9) \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_mgi_desc_parser.py b/src/python/test/xrefs/parsers/test_mgi_desc_parser.py new file mode 100644 index 000000000..02b46352b --- /dev/null +++ b/src/python/test/xrefs/parsers/test_mgi_desc_parser.py @@ -0,0 +1,148 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.MGIDescParser import MGIDescParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_synonym + +# Constants +SOURCE_ID_MGI_DESC = 1 +SPECIES_ID_MOUSE = 10090 +EXPECTED_NUMBER_OF_COLUMNS = 12 + +# Fixture to create an MGIDescParser instance +@pytest.fixture +def mgi_desc_parser() -> MGIDescParser: + return MGIDescParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(mgi_desc_parser: MGIDescParser, mock_xref_dbi: DBConnection, expected_xrefs: int, expected_synonyms: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = mgi_desc_parser.run( + { + "source_id": SOURCE_ID_MGI_DESC, + "species_id": SPECIES_ID_MOUSE, + "file": "parsers/flatfiles/mgi_desc.txt", + "xref_dbi": 
mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing MGI Description data" + assert ( + f"{expected_xrefs} MGI Description Xrefs added" in result_message + ), f"{prefix}Expected '{expected_xrefs} MGI Description Xrefs added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_synonyms} synonyms added" in result_message + ), f"{prefix}Expected '{expected_synonyms} synonyms added' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_mgi_desc_no_source_id(mgi_desc_parser: MGIDescParser, test_no_source_id: Callable[[MGIDescParser, int], None]) -> None: + test_no_source_id(mgi_desc_parser, SPECIES_ID_MOUSE) + +def test_mgi_desc_no_species_id(mgi_desc_parser: MGIDescParser, test_no_species_id: Callable[[MGIDescParser, int], None]) -> None: + test_no_species_id(mgi_desc_parser, SOURCE_ID_MGI_DESC) + +def test_mgi_desc_no_file(mgi_desc_parser: MGIDescParser, test_no_file: Callable[[MGIDescParser, int, int], None]) -> None: + test_no_file(mgi_desc_parser, SOURCE_ID_MGI_DESC, SPECIES_ID_MOUSE) + +# Test case to check if an error is raised when the file is not found +def test_mgi_desc_file_not_found(mgi_desc_parser: MGIDescParser, test_file_not_found: Callable[[MGIDescParser, int, int], None]) -> None: + test_file_not_found(mgi_desc_parser, SOURCE_ID_MGI_DESC, SPECIES_ID_MOUSE) + +# Test case to check if an error is raised when the file is empty +def test_mgi_desc_empty_file(mgi_desc_parser: MGIDescParser, test_empty_file: Callable[[MGIDescParser, str, int, int], None]) -> None: + test_empty_file(mgi_desc_parser, 'MGI_desc', SOURCE_ID_MGI_DESC, SPECIES_ID_MOUSE) + +# Test case to check if an error is raised when the header has insufficient columns +def test_insufficient_header_columns(mgi_desc_parser: MGIDescParser) -> None: + mock_file = io.StringIO("mgi accession id\tchr\tcm position\n") + mgi_desc_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in MGI_desc file"): + mgi_desc_parser.run( + { + "source_id": SOURCE_ID_MGI_DESC, + "species_id": SPECIES_ID_MOUSE, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Parametrized test case to check if an error is raised for various malformed headers +@pytest.mark.parametrize( + "header", [ + ("MGI_accession_ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChromosome\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Pos\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coord start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coord end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tchr strand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature 
Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tSymbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tMarker Status\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tName\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker_Type\tFeature Type\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Types\tMarker Synonyms (pipe-separated)\n"), + ("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms\n"), + ], + ids=[ + "accession column", "chromosome column", "position column", "coord start column", + "coord end column", "strand column", "symbol column", "status column", "name column", + "marker type column", "feature type column", "synonyms column" + ], +) +def test_malformed_headers(mgi_desc_parser: MGIDescParser, header: str) -> None: + mock_file = io.StringIO(header) + mgi_desc_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in MGI_desc file"): + mgi_desc_parser.run( + { + "source_id": SOURCE_ID_MGI_DESC, + "species_id": SPECIES_ID_MOUSE, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the file has insufficient columns +def test_insufficient_columns(mgi_desc_parser: MGIDescParser) -> None: + mock_file = io.StringIO() + mock_file.write("MGI Accession ID\tChr\tcM Position\tgenome coordinate start\tgenome coordinate end\tstrand\tMarker Symbol\tStatus\tMarker Name\tMarker Type\tFeature Type\tMarker Synonyms (pipe-separated)\n") + mock_file.write("MGI:1341858\t5\tsyntenic\n") + mock_file.seek(0) + + mgi_desc_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="has an incorrect number of columns"): + mgi_desc_parser.run( + { + "source_id": SOURCE_ID_MGI_DESC, + "species_id": SPECIES_ID_MOUSE, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing of valid MGI Description data +def test_successful_parsing(mock_xref_dbi: DBConnection, mgi_desc_parser: MGIDescParser) -> None: + # Run and validate parsing for MGI Description file + run_and_validate_parsing(mgi_desc_parser, mock_xref_dbi, 10, 2) + + # Check the row counts in the xref and synonym tables + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='MISC' AND source_id={SOURCE_ID_MGI_DESC}") + check_row_count(mock_xref_dbi, "synonym", 2) + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "MGI:1926146", SOURCE_ID_MGI_DESC, "Ecrg4") + + # Run and validate re-parsing for MGI Description file + run_and_validate_parsing(mgi_desc_parser, mock_xref_dbi, 10, 2, "Re-parsing: ") + + # Check the row counts in the xref and synonym tables again + check_row_count(mock_xref_dbi, "xref", 10, 
f"info_type='MISC' AND source_id={SOURCE_ID_MGI_DESC}") + check_row_count(mock_xref_dbi, "synonym", 2) + diff --git a/src/python/test/xrefs/parsers/test_mgi_parser.py b/src/python/test/xrefs/parsers/test_mgi_parser.py new file mode 100644 index 000000000..fab933d60 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_mgi_parser.py @@ -0,0 +1,84 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.MGIParser import MGIParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_synonym, check_direct_xref_link + +# Constants +SOURCE_ID_MGI = 1 +SPECIES_ID_MOUSE = 10090 + +# Fixture to create an MGIParser instance +@pytest.fixture +def mgi_parser() -> MGIParser: + return MGIParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(mgi_parser: MGIParser, mock_xref_dbi: DBConnection, expected_direct_xrefs: int, expected_synonyms: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = mgi_parser.run( + { + "source_id": SOURCE_ID_MGI, + "species_id": SPECIES_ID_MOUSE, + "file": "parsers/flatfiles/mgi.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing MGI data" + assert ( + f"{expected_direct_xrefs} direct MGI xrefs added" in result_message + ), f"{prefix}Expected '{expected_direct_xrefs} direct MGI xrefs added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_synonyms} synonyms added" in result_message + ), f"{prefix}Expected '{expected_synonyms} synonyms added' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_mgi_no_source_id(mgi_parser: MGIParser, test_no_source_id: Callable[[MGIParser, int], None]) -> None: + test_no_source_id(mgi_parser, SPECIES_ID_MOUSE) + +def test_mgi_no_species_id(mgi_parser: MGIParser, test_no_species_id: Callable[[MGIParser, int], None]) -> None: + test_no_species_id(mgi_parser, SOURCE_ID_MGI) + +def test_mgi_no_file(mgi_parser: MGIParser, test_no_file: Callable[[MGIParser, int, int], None]) -> None: + test_no_file(mgi_parser, SOURCE_ID_MGI, SPECIES_ID_MOUSE) + +# Test case to check if an error is raised when the file is not found +def test_mgi_file_not_found(mgi_parser: MGIParser, test_file_not_found: Callable[[MGIParser, int, int], None]) -> None: + test_file_not_found(mgi_parser, SOURCE_ID_MGI, SPECIES_ID_MOUSE) + +# Test case to check if an error is raised when the file is empty +def test_mgi_empty_file(mgi_parser: MGIParser, test_empty_file: Callable[[MGIParser, str, int, int], None]) -> None: + test_empty_file(mgi_parser, 'MGI', SOURCE_ID_MGI, SPECIES_ID_MOUSE) + +# Test case to check successful parsing of valid MGI data +def test_successful_parsing(mock_xref_dbi: DBConnection, mgi_parser: MGIParser) -> None: + # Mock the synonym hash to return some test synonyms + mgi_parser.get_ext_synonyms = MagicMock(return_value={"MGI:1926146": ["Ecrg4", "augurin"]}) + + # Run and validate parsing for MGI file + run_and_validate_parsing(mgi_parser, mock_xref_dbi, 10, 2) + + # Check the row counts in the xref and synonym tables + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DIRECT' AND source_id={SOURCE_ID_MGI}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 10) + check_row_count(mock_xref_dbi, "synonym", 2) + + # Check the link between an xref and gene_direct_xref + 
check_direct_xref_link(mock_xref_dbi, "gene", "MGI:1914753", "ENSMUSG00000103746") + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "MGI:1926146", SOURCE_ID_MGI, "Ecrg4") + check_synonym(mock_xref_dbi, "MGI:1926146", SOURCE_ID_MGI, "augurin") + + # Run and validate re-parsing for MGI file + run_and_validate_parsing(mgi_parser, mock_xref_dbi, 10, 2, "Re-parsing: ") + + # Check the row counts in the xref and synonym tables again + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DIRECT' AND source_id={SOURCE_ID_MGI}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 10) + check_row_count(mock_xref_dbi, "synonym", 2) diff --git a/src/python/test/xrefs/parsers/test_mim2gene_parser.py b/src/python/test/xrefs/parsers/test_mim2gene_parser.py new file mode 100644 index 000000000..590c1c3bc --- /dev/null +++ b/src/python/test/xrefs/parsers/test_mim2gene_parser.py @@ -0,0 +1,250 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable +from sqlalchemy import text + +from ensembl.production.xrefs.parsers.Mim2GeneParser import Mim2GeneParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_dependent_xref_link + +# Constants +SOURCE_ID_MIM2GENE = 1 +SOURCE_ID_MIM_GENE = 2 +SOURCE_ID_MIM_MORBID = 3 +SOURCE_ID_ENTREZGENE = 4 +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" + +# Fixture to create a Mim2GeneParser instance +@pytest.fixture +def mim2gene_parser() -> Mim2GeneParser: + return Mim2GeneParser(True) + +# Mock for get_source_id_for_source_name +def mock_get_source_id_for_source_name(source_name: str, mock_xref_dbi: DBConnection) -> int: + source_mapping = { + "MIM_GENE": SOURCE_ID_MIM_GENE, + "MIM_MORBID": SOURCE_ID_MIM_MORBID, + "EntrezGene": SOURCE_ID_ENTREZGENE, + } + return source_mapping.get(source_name, SOURCE_ID_MIM2GENE) + +# Function to populate the database with MIM and EntrezGene xrefs +def populate_xref_db(mock_xref_dbi: DBConnection): + source_data = [ + [SOURCE_ID_MIM2GENE, 'MIM2GENE', 10], + [SOURCE_ID_MIM_GENE, 'MIM_GENE', 10], + [SOURCE_ID_MIM_MORBID, 'MIM_MORBID', 10], + [SOURCE_ID_ENTREZGENE, 'EntrezGene', 10], + ] + for row in source_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO source (source_id, name, ordered) + VALUES (:source_id, :name, :ordered) + """ + ), + { + "source_id": row[0], + "name": row[1], + "ordered": row[2], + } + ) + + xref_data = [ + [1, '100050', SOURCE_ID_MIM_MORBID, SPECIES_ID_HUMAN, 'UNMAPPED'], # unmapped + [2, '100640', SOURCE_ID_MIM_GENE, SPECIES_ID_HUMAN, 'UNMAPPED'], # dependent + [3, '100100', SOURCE_ID_MIM_MORBID, SPECIES_ID_HUMAN, 'UNMAPPED'], # dependent + [4, '142830', SOURCE_ID_MIM_MORBID, SPECIES_ID_HUMAN, 'UNMAPPED'], # unmapped + [5, '142830', SOURCE_ID_MIM_GENE, SPECIES_ID_HUMAN, 'UNMAPPED'], # unmapped + [6, '100660', SOURCE_ID_MIM_GENE, SPECIES_ID_HUMAN, 'UNMAPPED'], # dependent + [7, '100300', SOURCE_ID_MIM_MORBID, SPECIES_ID_HUMAN, 'UNMAPPED'], # via synonym + [8, '999999', SOURCE_ID_MIM_GENE, SPECIES_ID_HUMAN, 'UNMAPPED'], # not referenced + [9, '216', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DIRECT'], # <- 100640 + [10, '1131', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DIRECT'], # <- 100100 + [11, '218', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DIRECT'], # 100660 + [12, '222222', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DIRECT'], # not referenced <- via synonym + ] + for row in xref_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO xref (xref_id, accession, source_id, 
species_id, info_type) + VALUES (:xref_id, :accession, :source_id, :species_id, :info_type) + """ + ), + { + "xref_id": row[0], + "accession": row[1], + "source_id": row[2], + "species_id": row[3], + "info_type": row[4], + } + ) + + mock_xref_dbi.commit() + +# Function to run and validate the parsing process +def run_and_validate_parsing(mim2gene_parser: Mim2GeneParser, mock_xref_dbi: DBConnection, expected_entries: int, expected_missed_omim: int, expected_entrez: int, expected_missed_master: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = mim2gene_parser.run( + { + "source_id": SOURCE_ID_MIM2GENE, + "species_id": SPECIES_ID_HUMAN, + "file": "parsers/flatfiles/mim2gene.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing Mim2Gene data" + assert ( + f"Processed {expected_entries} entries" in result_message + ), f"{prefix}Expected 'Processed {expected_entries} entries' in result message, but got: '{result_message}'" + assert ( + f"{expected_missed_omim} had missing OMIM entries" in result_message + ), f"{prefix}Expected '{expected_missed_omim} had missing OMIM entries' in result message, but got: '{result_message}'" + assert ( + f"{expected_entrez} were dependent EntrezGene xrefs" in result_message + ), f"{prefix}Expected '{expected_entrez} were dependent EntrezGene xrefs' in result message, but got: '{result_message}'" + assert ( + f"{expected_missed_master} had missing master entries" in result_message + ), f"{prefix}Expected '{expected_missed_master} had missing master entries' in result message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_mim2gene_no_source_id(mim2gene_parser: Mim2GeneParser, test_no_source_id: Callable[[Mim2GeneParser, int], None]) -> None: + test_no_source_id(mim2gene_parser, SPECIES_ID_HUMAN) + +def test_mim2gene_no_species_id(mim2gene_parser: Mim2GeneParser, test_no_species_id: Callable[[Mim2GeneParser, int], None]) -> None: + test_no_species_id(mim2gene_parser, SOURCE_ID_MIM2GENE) + +def test_mim2gene_no_file(mim2gene_parser: Mim2GeneParser, test_no_file: Callable[[Mim2GeneParser, int, int], None]) -> None: + test_no_file(mim2gene_parser, SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_mim2gene_file_not_found(mim2gene_parser: Mim2GeneParser, test_file_not_found: Callable[[Mim2GeneParser, int, int], None]) -> None: + test_file_not_found(mim2gene_parser, SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is empty +def test_mim2gene_empty_file(mim2gene_parser: Mim2GeneParser, test_empty_file: Callable[[Mim2GeneParser, str, int, int], None]) -> None: + test_empty_file(mim2gene_parser, 'Mim2Gene', SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the required source_id is missing +def test_mim2gene_missing_required_source_id(mim2gene_parser: Mim2GeneParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[Mim2GeneParser, DBConnection, str, int, int, str], None]) -> None: + test_missing_required_source_id(mim2gene_parser, mock_xref_dbi, 'MIM_GENE', SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the header has insufficient columns +def test_insufficient_header_columns(mim2gene_parser: Mim2GeneParser) -> None: + mim2gene_parser.get_source_id_for_source_name = 
MagicMock(side_effect=mock_get_source_id_for_source_name) + + mock_file = io.StringIO("#MIM number\tGeneID\ttype\tSource\tMedGenCUI\n") + mim2gene_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in Mim2Gene file"): + mim2gene_parser.run( + { + "source_id": SOURCE_ID_MIM2GENE, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Parametrized test case to check if an error is raised for various malformed headers +@pytest.mark.parametrize( + "header", [ + ("#MIM\tGeneID\ttype\tSource\tMedGenCUI\tComment\n"), + ("#MIM number\tGene_ID\ttype\tSource\tMedGenCUI\tComment\n"), + ("#MIM number\tGeneID\tTYPE\tSource\tMedGenCUI\tComment\n"), + ("#MIM number\tGeneID\ttype\tsource\tMedGenCUI\tComment\n"), + ("#MIM number\tGeneID\ttype\tSource\tMedGen\tComment\n"), + ("#MIM number\tGeneID\ttype\tSource\tMedGenCUI\tComments\n"), + ], + ids=["mim_number column", "gene_id column", "type column", "source column", "medgen_cui column", "comment column"], +) +def test_malformed_headers(mim2gene_parser: Mim2GeneParser, header: str) -> None: + mim2gene_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + mock_file = io.StringIO(header) + mim2gene_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Malformed or unexpected header in Mim2Gene file"): + mim2gene_parser.run( + { + "source_id": SOURCE_ID_MIM2GENE, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the file has insufficient columns +def test_insufficient_columns(mim2gene_parser: Mim2GeneParser) -> None: + mim2gene_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + mock_file = io.StringIO() + mock_file.write("#MIM number\tGeneID\ttype\tSource\tMedGenCUI\tComment\n") + mock_file.write("100050\t-\tphenotype\t-\n") + mock_file.seek(0) + + mim2gene_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="has an incorrect number of columns"): + mim2gene_parser.run( + { + "source_id": SOURCE_ID_MIM2GENE, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing of valid Mim2Gene data without existing mim or entrezgene xrefs +def test_successful_parsing_without_existing_xrefs(mock_xref_dbi: DBConnection, mim2gene_parser: Mim2GeneParser) -> None: + mim2gene_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + # Run and validate parsing for Mim2Gene file + run_and_validate_parsing(mim2gene_parser, mock_xref_dbi, 9, 9, 0, 0) + + # Check that no xrefs were added + check_row_count(mock_xref_dbi, "xref", 0) + +# Test case to check successful parsing of valid Mim2Gene data with existing mim and entrezgene xrefs +def test_successful_parsing_with_existing_xrefs(mock_xref_dbi: DBConnection, mim2gene_parser: Mim2GeneParser) -> None: + mim2gene_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + populate_xref_db(mock_xref_dbi) + + # Check the row counts in the xref and dependent_xref tables before running the parser + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_GENE}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='UNMAPPED' AND 
source_id={SOURCE_ID_MIM_MORBID}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DIRECT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "dependent_xref", 0) + + # Run and validate parsing for Mim2Gene file + run_and_validate_parsing(mim2gene_parser, mock_xref_dbi, 9, 4, 3, 2) + + # Check the row counts in the xref and dependent_xref tables + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_GENE}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_MIM_GENE}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_MORBID}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_MIM_MORBID}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DIRECT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "dependent_xref", 3) + + # Check the link between an xref and dependent_xref + check_dependent_xref_link(mock_xref_dbi, "100640", 9) + check_dependent_xref_link(mock_xref_dbi, "100100", 10) + + # Run and validate re-parsing for Mim2Gene file + run_and_validate_parsing(mim2gene_parser, mock_xref_dbi, 9, 4, 3, 2, "Re-parsing: ") + + # Check the row counts in the xref and dependent_xref tables + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_GENE}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_MIM_GENE}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_MORBID}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_MIM_MORBID}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DIRECT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "dependent_xref", 3) diff --git a/src/python/test/xrefs/parsers/test_mim_parser.py b/src/python/test/xrefs/parsers/test_mim_parser.py new file mode 100644 index 000000000..676c182bf --- /dev/null +++ b/src/python/test/xrefs/parsers/test_mim_parser.py @@ -0,0 +1,126 @@ +import pytest +from unittest.mock import MagicMock, patch +from typing import Callable + +from ensembl.production.xrefs.parsers.MIMParser import MIMParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_synonym + +# Constants +SOURCE_ID_MIM = 1 +SOURCE_ID_MIM_GENE = 2 +SOURCE_ID_MIM_MORBID = 3 +SPECIES_ID_HUMAN = 9606 + +# Fixture to create a MIMParser instance +@pytest.fixture +def mim_parser() -> MIMParser: + return MIMParser(True) + +# Mock for get_source_id_for_source_name +def mock_get_source_id_for_source_name(source_name: str, mock_xref_dbi: DBConnection) -> int: + if source_name == "MIM_GENE": + return SOURCE_ID_MIM_GENE + elif source_name == "MIM_MORBID": + return SOURCE_ID_MIM_MORBID + else: + return SOURCE_ID_MIM + +# Function to run and validate the parsing process +def run_and_validate_parsing(mim_parser: MIMParser, mock_xref_dbi: DBConnection, expected_genemap_xrefs: int, expected_phenotype_xrefs: int, expected_synonyms: int, expected_removed_entries: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = mim_parser.run( + { + "source_id": SOURCE_ID_MIM, + "species_id": SPECIES_ID_HUMAN, + "file": "parsers/flatfiles/mim.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing MIM data" + assert ( + 
f"{expected_genemap_xrefs} genemap and {expected_phenotype_xrefs} phenotype MIM xrefs added" in result_message + ), f"{prefix}Expected '{expected_genemap_xrefs} genemap and {expected_phenotype_xrefs} phenotype MIM xrefs added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_synonyms} synonyms (defined by MOVED TO) added" in result_message + ), f"{prefix}Expected '{expected_synonyms} synonyms (defined by MOVED TO) added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_removed_entries} entries removed" in result_message + ), f"{prefix}Expected '{expected_removed_entries} entries removed' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_mim_no_source_id(mim_parser: MIMParser, test_no_source_id: Callable[[MIMParser, int], None]) -> None: + test_no_source_id(mim_parser, SPECIES_ID_HUMAN) + +def test_mim_no_species_id(mim_parser: MIMParser, test_no_species_id: Callable[[MIMParser, int], None]) -> None: + test_no_species_id(mim_parser, SOURCE_ID_MIM) + +def test_mim_no_file(mim_parser: MIMParser, test_no_file: Callable[[MIMParser, int, int], None]) -> None: + test_no_file(mim_parser, SOURCE_ID_MIM, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_mim_file_not_found(mim_parser: MIMParser, test_file_not_found: Callable[[MIMParser, int, int], None]) -> None: + test_file_not_found(mim_parser, SOURCE_ID_MIM, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the TI field is missing +def test_missing_ti_field(mim_parser: MIMParser) -> None: + mock_file_content = [ + "*RECORD*\n*FIELD*\nNO\n100050\n" + ] + mim_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + with patch.object(MIMParser, 'get_file_sections', return_value=mock_file_content): + with pytest.raises(ValueError, match="Failed to extract TI field from record"): + mim_parser.run( + { + "source_id": SOURCE_ID_MIM, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the TI field has an invalid format +def test_invalid_ti_field(mim_parser: MIMParser) -> None: + mock_file_content = [ + "*RECORD*\n*FIELD*\nNO\n100050\n*FIELD*\nTI\nAARSKOG SYNDROME, AUTOSOMAL DOMINANT\n*FIELD*\nTX\n\nDESCRIPTION\n\nAarskog syndrome is characterized by short stature and facial, limb,\n\n*THEEND*\n" + ] + mim_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + with patch.object(MIMParser, 'get_file_sections', return_value=mock_file_content): + with pytest.raises(ValueError, match="Failed to extract record type and description from TI field"): + mim_parser.run( + { + "source_id": SOURCE_ID_MIM, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing of valid MIM data +def test_successful_parsing(mock_xref_dbi: DBConnection, mim_parser: MIMParser) -> None: + mim_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + # Run and validate parsing for MIM file + run_and_validate_parsing(mim_parser, mock_xref_dbi, 2, 4, 2, 1) + + # Check the row counts in the xref and synonym tables + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_GENE}") + check_row_count(mock_xref_dbi, "xref", 4, 
f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_MORBID}") + check_row_count(mock_xref_dbi, "synonym", 4) + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "200150", SOURCE_ID_MIM_GENE, "100500") + check_synonym(mock_xref_dbi, "200150", SOURCE_ID_MIM_MORBID, "100650") + + # Check for re-parsing of the same file + run_and_validate_parsing(mim_parser, mock_xref_dbi, 2, 4, 2, 1, "Re-parsing: ") + + # Re-check the row counts in the xref and synonym tables after re-parsing + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_GENE}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='UNMAPPED' AND source_id={SOURCE_ID_MIM_MORBID}") + check_row_count(mock_xref_dbi, "synonym", 4) diff --git a/src/python/test/xrefs/parsers/test_mirbase_parser.py b/src/python/test/xrefs/parsers/test_mirbase_parser.py new file mode 100644 index 000000000..f9c426c3a --- /dev/null +++ b/src/python/test/xrefs/parsers/test_mirbase_parser.py @@ -0,0 +1,111 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.miRBaseParser import miRBaseParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_sequence + +# Constants +SOURCE_ID_MIRBASE = 1 +SPECIES_ID_C_ELEGANS = 6239 +SPECIES_NAME_C_ELEGANS = "caenorhabditis_elegans" +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" + +# Fixture to create a miRBaseParser instance +@pytest.fixture +def mirbase_parser() -> miRBaseParser: + return miRBaseParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(mirbase_parser: miRBaseParser, mock_xref_dbi: DBConnection, expected_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = mirbase_parser.run( + { + "source_id": SOURCE_ID_MIRBASE, + "species_id": SPECIES_ID_C_ELEGANS, + "species_name": SPECIES_NAME_C_ELEGANS, + "file": "parsers/flatfiles/mirbase.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing miRBase data" + assert ( + f"Read {expected_xrefs} xrefs from" in result_message + ), f"{prefix}Expected 'Read {expected_xrefs} xrefs from' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_mirbase_no_source_id(mirbase_parser: miRBaseParser, test_no_source_id: Callable[[miRBaseParser, int], None]) -> None: + test_no_source_id(mirbase_parser, SPECIES_ID_C_ELEGANS) + +def test_mirbase_no_species_id(mirbase_parser: miRBaseParser, test_no_species_id: Callable[[miRBaseParser, int], None]) -> None: + test_no_species_id(mirbase_parser, SOURCE_ID_MIRBASE) + +def test_mirbase_no_file(mirbase_parser: miRBaseParser, test_no_file: Callable[[miRBaseParser, int, int], None]) -> None: + test_no_file(mirbase_parser, SOURCE_ID_MIRBASE, SPECIES_ID_C_ELEGANS) + +# Test case to check if an error is raised when the file is not found +def test_mirbase_file_not_found(mirbase_parser: miRBaseParser, test_file_not_found: Callable[[miRBaseParser, int, int], None]) -> None: + mirbase_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_C_ELEGANS: [SPECIES_NAME_C_ELEGANS]}) + test_file_not_found(mirbase_parser, SOURCE_ID_MIRBASE, SPECIES_ID_C_ELEGANS) + +# Test case to check if parsing is skipped when no species name can be found +def test_no_species_name(mock_xref_dbi: DBConnection, mirbase_parser: 
miRBaseParser) -> None: + mirbase_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_HUMAN: [SPECIES_NAME_HUMAN]}) + + result_code, result_message = mirbase_parser.run( + { + "source_id": SOURCE_ID_MIRBASE, + "species_id": SPECIES_ID_C_ELEGANS, + "file": "dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"Errors when parsing miRBase data" + assert ( + "Skipped. Could not find species ID to name mapping" in result_message + ), f"Expected 'Skipped. Could not find species ID to name mapping' in result_message, but got: '{result_message}'" + +# Test case to check if no xrefs are added when the species name provided is not in the file +def test_no_xrefs_added(mock_xref_dbi: DBConnection, mirbase_parser: miRBaseParser) -> None: + mirbase_parser.species_id_to_names = MagicMock(return_value={}) + + result_code, result_message = mirbase_parser.run( + { + "source_id": SOURCE_ID_MIRBASE, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "file": f"parsers/flatfiles/mirbase.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"Errors when parsing miRBase data" + assert "No xrefs added" in result_message, f"Expected 'No xrefs added' in result_message, but got: '{result_message}'" + +# Test case to check successful parsing of valid miRBase data +def test_successful_parsing(mock_xref_dbi: DBConnection, mirbase_parser: miRBaseParser) -> None: + mirbase_parser.species_id_to_names = MagicMock(return_value={}) + + # Run and validate parsing for miRBase file + run_and_validate_parsing(mirbase_parser, mock_xref_dbi, 6) + + # Check the row counts in the xref and synonym tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_MIRBASE}") + check_row_count(mock_xref_dbi, "primary_xref", 6) + + # Check the sequences for specific accessions + check_sequence(mock_xref_dbi, "MI0000002", SOURCE_ID_MIRBASE, "ATGCTTCCGGCCTGTTCCCTGAGACCTCAAGTGTGAGTGTACTATTGATGCTTCACACCTGGGCTCTCCGGGTACCAGGACGGTTTGAGCAGAT") + check_sequence(mock_xref_dbi, "MI0000006", SOURCE_ID_MIRBASE, "TCTCGGATCAGATCGAGCCATTGCTGGTTTCTTCCACAGTGGTACTTTCCATTAGAACTATCACCGGGTGGAAACTAGCAGTGGCTCGATCTTTTCC") + + # Run and validate parsing for miRBase file + run_and_validate_parsing(mirbase_parser, mock_xref_dbi, 6, "Re-parsing: ") + + # Check the row counts in the xref and synonym tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_MIRBASE}") + check_row_count(mock_xref_dbi, "primary_xref", 6) diff --git a/src/python/test/xrefs/parsers/test_reactome_parser.py b/src/python/test/xrefs/parsers/test_reactome_parser.py new file mode 100644 index 000000000..9187fde0e --- /dev/null +++ b/src/python/test/xrefs/parsers/test_reactome_parser.py @@ -0,0 +1,166 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable +from sqlalchemy import text + +from ensembl.production.xrefs.parsers.ReactomeParser import ReactomeParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link, check_dependent_xref_link, check_release + +# Constants +SOURCE_ID_REACTOME = 1 +SOURCE_ID_REACTOME_DIRECT = 2 +SOURCE_ID_REACTOME_UNIPROT = 3 +SOURCE_ID_REACTOME_GENE = 4 +SOURCE_ID_REACTOME_TRANSCRIPT = 5 +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" + +# Fixture to create a ReactomeParser instance +@pytest.fixture +def reactome_parser() -> ReactomeParser: + return ReactomeParser(True) + +# Function to populate 
the database with sources +def populate_xref_db(mock_xref_dbi: DBConnection): + source_data = [ + [SOURCE_ID_REACTOME, 'reactome', 10, ''], + [SOURCE_ID_REACTOME_TRANSCRIPT, 'reactome_transcript', 10, ''], + [SOURCE_ID_REACTOME_GENE, 'reactome_gene', 10, ''], + [SOURCE_ID_REACTOME_DIRECT, 'reactome', 10, 'direct'], + [SOURCE_ID_REACTOME_UNIPROT, 'reactome', 10, 'uniprot'], + ] + for row in source_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO source (source_id, name, ordered, priority_description) + VALUES (:source_id, :name, :ordered, :priority_description) + """ + ), + { + "source_id": row[0], + "name": row[1], + "ordered": row[2], + "priority_description": row[3], + } + ) + +# Function to run and validate the parsing process +def run_and_validate_parsing(reactome_parser: ReactomeParser, mock_xref_dbi: DBConnection, file: str, expected_processed: int, expected_dependent: int, expected_direct: int, expected_errors: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = reactome_parser.run( + { + "source_id": SOURCE_ID_REACTOME, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "file": f"parsers/flatfiles/{file}.txt", + "rel_file": "parsers/flatfiles/reactome_release.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing Reactome data" + assert ( + f"{expected_processed} Reactome entries processed" in result_message + ), f"{prefix}Expected '{expected_processed} Reactome entries processed' in result_message, but got: '{result_message}'" + assert ( + f"{expected_dependent} dependent xrefs added" in result_message + ), f"{prefix}Expected '{expected_dependent} dependent xrefs added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_direct} direct xrefs added" in result_message + ), f"{prefix}Expected '{expected_direct} direct xrefs added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_errors} not found" in result_message + ), f"{prefix}Expected '{expected_errors} not found' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_reactome_no_source_id(reactome_parser: ReactomeParser, test_no_source_id: Callable[[ReactomeParser, int], None]) -> None: + test_no_source_id(reactome_parser, SPECIES_ID_HUMAN) + +def test_reactome_no_species_id(reactome_parser: ReactomeParser, test_no_species_id: Callable[[ReactomeParser, int], None]) -> None: + test_no_species_id(reactome_parser, SOURCE_ID_REACTOME) + +def test_reactome_no_file(reactome_parser: ReactomeParser, test_no_file: Callable[[ReactomeParser, int, int], None]) -> None: + test_no_file(reactome_parser, SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) + +# Test case to check if parsing is skipped when no species name can be found +def test_no_species_name(mock_xref_dbi: DBConnection, reactome_parser: ReactomeParser) -> None: + result_code, result_message = reactome_parser.run( + { + "source_id": SOURCE_ID_REACTOME, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"Errors when parsing Reactome data" + assert ( + "Skipped. Could not find species ID to name mapping" in result_message + ), f"Expected 'Skipped. 
Could not find species ID to name mapping' in result_message, but got: '{result_message}'" + +# Test case to check if an error is raised when the required source_id is missing +def test_reactome_missing_required_source_id(reactome_parser: ReactomeParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[ReactomeParser, DBConnection, str, int, int, str], None]) -> None: + reactome_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_HUMAN: [SPECIES_NAME_HUMAN]}) + test_missing_required_source_id(reactome_parser, mock_xref_dbi, 'reactome', SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_reactome_file_not_found(reactome_parser: ReactomeParser, test_file_not_found: Callable[[ReactomeParser, int, int], None]) -> None: + reactome_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_HUMAN: [SPECIES_NAME_HUMAN]}) + test_file_not_found(reactome_parser, SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is empty +def test_reactome_empty_file(reactome_parser: ReactomeParser, test_empty_file: Callable[[ReactomeParser, str, int, int], None]) -> None: + reactome_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_HUMAN: [SPECIES_NAME_HUMAN]}) + test_empty_file(reactome_parser, 'Reactome', SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) + +# Test case to check successful parsing of valid Reactome data without existing uniprot xrefs +def test_successful_parsing_without_existing_uniprot(mock_xref_dbi: DBConnection, reactome_parser: ReactomeParser) -> None: + populate_xref_db(mock_xref_dbi) + + # Run and validate parsing for Uniprot and Ensembl Reactome files + run_and_validate_parsing(reactome_parser, mock_xref_dbi, "reactome_UniProt", 8, 0, 0, 0) + run_and_validate_parsing(reactome_parser, mock_xref_dbi, "reactome_ensembl", 14, 0, 13, 1) + + # Check the row counts in the xref and direct_xref tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_GENE}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_TRANSCRIPT}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_DIRECT}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 6) + check_row_count(mock_xref_dbi, "transcript_direct_xref", 4) + check_row_count(mock_xref_dbi, "translation_direct_xref", 3) + + # Check the link between an xref and direct_xref tables + check_direct_xref_link(mock_xref_dbi, "gene", "R-HSA-1643685", "ENSG00000000419") + check_direct_xref_link(mock_xref_dbi, "transcript", "R-HSA-199991", "ENST00000000233") + check_direct_xref_link(mock_xref_dbi, "translation", "R-HSA-199991", "ENSP00000000233") + +# Test case to check successful parsing of valid Reactome data with existing uniprot xrefs +def test_successful_parsing_with_existing_uniprot(mock_xref_dbi: DBConnection, reactome_parser: ReactomeParser) -> None: + populate_xref_db(mock_xref_dbi) + reactome_parser.get_valid_codes = MagicMock(return_value={"A0A075B6P5": [12], "A0A075B6S6" : [34, 56], "A0A087WPF7": [78], "A0A096LNF2": [90]}) + + # Run and validate re-parsing for Uniprot and Ensembl Reactome files + run_and_validate_parsing(reactome_parser, mock_xref_dbi, "reactome_UniProt", 8, 6, 0, 0, "Re-parsing: ") + run_and_validate_parsing(reactome_parser, mock_xref_dbi, "reactome_ensembl", 14, 0, 13, 1, "Re-parsing: ") + + # Check the row counts in the xref, direct_xref, and 
dependent_xref tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_GENE}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_TRANSCRIPT}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_DIRECT}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_REACTOME_UNIPROT}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 6) + check_row_count(mock_xref_dbi, "transcript_direct_xref", 4) + check_row_count(mock_xref_dbi, "translation_direct_xref", 3) + check_row_count(mock_xref_dbi, "dependent_xref", 5) + + # Check the link between an xref and dependent_xref + check_dependent_xref_link(mock_xref_dbi, "R-HSA-1280218", 34) + check_dependent_xref_link(mock_xref_dbi, "R-HSA-1280218", 56) + check_dependent_xref_link(mock_xref_dbi, "R-HSA-166663", 90) + + # Check the release info + check_release(mock_xref_dbi, SOURCE_ID_REACTOME, "88") diff --git a/src/python/test/xrefs/parsers/test_refseq_parser.py b/src/python/test/xrefs/parsers/test_refseq_parser.py new file mode 100644 index 000000000..2b8a77f2c --- /dev/null +++ b/src/python/test/xrefs/parsers/test_refseq_parser.py @@ -0,0 +1,243 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable, Dict +from sqlalchemy import text + +from ensembl.production.xrefs.parsers.RefSeqParser import RefSeqParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_dependent_xref_link, check_sequence, check_release + +# Constants +SOURCE_ID_REFSEQ = 1 +SOURCE_ID_REFSEQ_MRNA = 2 +SOURCE_ID_REFSEQ_NCRNA = 3 +SOURCE_ID_REFSEQ_MRNA_PREDICTED = 4 +SOURCE_ID_REFSEQ_NCRNA_PREDICTED = 5 +SOURCE_ID_REFSEQ_PEPTIDE = 6 +SOURCE_ID_REFSEQ_PEPTIDE_PREDICTED = 7 +SOURCE_ID_ENTREZGENE = 8 +SOURCE_ID_WIKIGENE = 9 +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" + +# Fixture to create a RefSeqParser instance +@pytest.fixture +def refseq_parser() -> RefSeqParser: + return RefSeqParser(True) + +# Function to populate the database with EntrezGene and WikiGene xrefs +def populate_xref_db(mock_xref_dbi: DBConnection): + source_data = [ + [SOURCE_ID_REFSEQ_MRNA, 'RefSeq_mRNA', 10, 'refseq'], + [SOURCE_ID_REFSEQ_MRNA_PREDICTED, 'RefSeq_mRNA_predicted', 10, 'refseq'], + [SOURCE_ID_REFSEQ_NCRNA, 'RefSeq_ncRNA', 10, ''], + [SOURCE_ID_REFSEQ_NCRNA_PREDICTED, 'RefSeq_ncRNA_predicted', 10, ''], + [SOURCE_ID_REFSEQ_PEPTIDE, 'RefSeq_peptide', 10, ''], + [SOURCE_ID_REFSEQ_PEPTIDE_PREDICTED, 'RefSeq_peptide_predicted', 10, ''], + [SOURCE_ID_ENTREZGENE, 'EntrezGene', 10, ''], + [SOURCE_ID_WIKIGENE, 'WikiGene', 10, ''], + ] + for row in source_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO source (source_id, name, ordered, priority_description) + VALUES (:source_id, :name, :ordered, :priority_description) + """ + ), + { + "source_id": row[0], + "name": row[1], + "ordered": row[2], + "priority_description": row[3], + } + ) + + xref_data = [ + [1, '105373289', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'LOC105373289'], + [2, '105373289', SOURCE_ID_WIKIGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'LOC105373289'], + [3, '100128640', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'ACVR2B-AS1'], + [4, '100128640', SOURCE_ID_WIKIGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'ACVR2B-AS1'], + [5, '102465874', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'MIR8075'], + [6, '102465874', SOURCE_ID_WIKIGENE, 
SPECIES_ID_HUMAN, 'DEPENDENT', 'MIR8075'], + [7, '401447', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'USP17L1'], + [8, '401447', SOURCE_ID_WIKIGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'USP17L1'], + [9, '728393', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'USP17L27'], + [10, '728393', SOURCE_ID_WIKIGENE, SPECIES_ID_HUMAN, 'DEPENDENT', 'USP17L27'], + ] + for row in xref_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO xref (xref_id, accession, source_id, species_id, info_type, label) + VALUES (:xref_id, :accession, :source_id, :species_id, :info_type, :label) + """ + ), + { + "xref_id": row[0], + "accession": row[1], + "source_id": row[2], + "species_id": row[3], + "info_type": row[4], + "label": row[5], + } + ) + + mock_xref_dbi.commit() + +# Mock for get_source_id_for_source_name +def mock_get_source_id_for_source_name(source_name: str, mock_xref_dbi: DBConnection, desc: str = None) -> int: + source_mapping = { + "RefSeq_peptide": SOURCE_ID_REFSEQ_PEPTIDE, + "RefSeq_mRNA": SOURCE_ID_REFSEQ_MRNA, + "RefSeq_ncRNA": SOURCE_ID_REFSEQ_NCRNA, + "RefSeq_peptide_predicted": SOURCE_ID_REFSEQ_PEPTIDE_PREDICTED, + "RefSeq_mRNA_predicted": SOURCE_ID_REFSEQ_MRNA_PREDICTED, + "RefSeq_ncRNA_predicted": SOURCE_ID_REFSEQ_NCRNA_PREDICTED, + "EntrezGene": SOURCE_ID_ENTREZGENE, + "WikiGene": SOURCE_ID_WIKIGENE, + } + return source_mapping.get(source_name, SOURCE_ID_REFSEQ) + +# Function to run and validate the parsing process +def run_and_validate_parsing(refseq_parser: RefSeqParser, mock_xref_dbi: DBConnection, file:str, expected_xrefs: Dict[str, int], prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = refseq_parser.run( + { + "source_id": SOURCE_ID_REFSEQ, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "file": f"parsers/flatfiles/{file}.txt", + "rel_file": "parsers/flatfiles/refseq_release.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + mrna = expected_xrefs["num_mrna"] + mrna_pred = expected_xrefs["num_pred_mrna"] + ncrna = expected_xrefs["num_ncrna"] + ncrna_pred = expected_xrefs["num_pred_ncrna"] + peptide = expected_xrefs["num_peptide"] + peptide_pred = expected_xrefs["num_pred_peptide"] + entrez = expected_xrefs["num_entrez"] + wiki = expected_xrefs["num_wiki"] + + assert result_code == 0, f"{prefix}Errors when parsing RefSeq GPFF data" + assert ( + f"Added {mrna} mRNA xrefs, {mrna_pred} predicted mRNA xrefs," in result_message + ), f"{prefix}Expected 'Added {mrna} mRNA xrefs, {mrna_pred} predicted mRNA xrefs,' in result_message, but got: '{result_message}'" + assert ( + f"{ncrna} ncRNA xrefs, {ncrna_pred} predicted ncRNA xrefs," in result_message + ), f"{prefix}Expected '{ncrna} ncRNA xrefs, {ncrna_pred} predicted ncRNA xrefs,' in result_message, but got: '{result_message}'" + assert ( + f"{peptide} peptide xrefs, and {peptide_pred} predicted peptide xrefs" in result_message + ), f"{prefix}Expected '{peptide} peptide xrefs, and {peptide_pred} predicted peptide xref' in result_message, but got: '{result_message}'" + assert ( + f"EntrezGene\t{entrez}" in result_message + ), f"{prefix}Expected 'EntrezGene\t{entrez}' in result_message, but got: '{result_message}'" + assert ( + f"WikiGene\t{wiki}" in result_message + ), f"{prefix}Expected 'WikiGene\t{wiki}' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_refseq_no_source_id(refseq_parser: RefSeqParser, test_no_source_id: Callable[[RefSeqParser, int], 
None]) -> None: + test_no_source_id(refseq_parser, SPECIES_ID_HUMAN) + +def test_refseq_no_species_id(refseq_parser: RefSeqParser, test_no_species_id: Callable[[RefSeqParser, int], None]) -> None: + test_no_species_id(refseq_parser, SOURCE_ID_REFSEQ_MRNA) + +def test_refseq_no_file(refseq_parser: RefSeqParser, test_no_file: Callable[[RefSeqParser, int, int], None]) -> None: + test_no_file(refseq_parser, SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the required source_id is missing +def test_refseq_missing_required_source_id(refseq_parser: RefSeqParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[RefSeqParser, DBConnection, str, int, int, str], None]) -> None: + test_missing_required_source_id(refseq_parser, mock_xref_dbi, 'RefSeq_peptide', SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) + +# Test case to check if parsing is skipped when no species name can be found +def test_no_species_name(mock_xref_dbi: DBConnection, refseq_parser: RefSeqParser) -> None: + refseq_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + result_code, result_message = refseq_parser.run( + { + "source_id": SOURCE_ID_REFSEQ, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"Errors when parsing RefSeq data" + assert ( + "Skipped. Could not find species ID to name mapping" in result_message + ), f"Expected 'Skipped. Could not find species ID to name mapping' in result_message, but got: '{result_message}'" + +# Test case to check if parsing is skipped when the file type is not supported +def test_invalid_file_type(mock_xref_dbi: DBConnection, refseq_parser: RefSeqParser) -> None: + refseq_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + + result_code, result_message = refseq_parser.run( + { + "source_id": SOURCE_ID_REFSEQ, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"Errors when parsing RefSeq data" + assert ( + "Skipped. Could not work out sequence type" in result_message + ), f"Expected 'Skipped. 
Could not work out sequence type' in result_message, but got: '{result_message}'" + +# Test case to check if an error is raised when the file is not found +def test_refseq_file_not_found(refseq_parser: RefSeqParser, test_file_not_found: Callable[[RefSeqParser, int, int], None]) -> None: + refseq_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) + refseq_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_HUMAN: [SPECIES_NAME_HUMAN]}) + refseq_parser.type_from_file = MagicMock(return_value="dna") + + test_file_not_found(refseq_parser, SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) + +# Test case to check successful parsing of valid RefSeq GPFF data +def test_successful_parsing(mock_xref_dbi: DBConnection, refseq_parser: RefSeqParser) -> None: + populate_xref_db(mock_xref_dbi) + + # Check the row counts in the xref table before running the parser + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_WIKIGENE}") + check_row_count(mock_xref_dbi, "dependent_xref", 0) + + # Run and validate parsing for RefSeq dna and peptide files + expected_counts = {"num_mrna": 5, "num_pred_mrna": 2, "num_ncrna": 2, "num_pred_ncrna": 1, "num_peptide": 0, "num_pred_peptide": 0, "num_entrez": 5, "num_wiki": 5} + run_and_validate_parsing(refseq_parser, mock_xref_dbi, "refseq_rna", expected_counts) + expected_counts = {"num_mrna": 0, "num_pred_mrna": 0, "num_ncrna": 0, "num_pred_ncrna": 0, "num_peptide": 5, "num_pred_peptide": 3, "num_entrez": 2, "num_wiki": 2} + run_and_validate_parsing(refseq_parser, mock_xref_dbi, "refseq_protein", expected_counts) + + # Check the row counts in the xref, dependent_xref, and primary_xref tables + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_REFSEQ_MRNA}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_REFSEQ_MRNA_PREDICTED}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_REFSEQ_NCRNA}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_REFSEQ_NCRNA_PREDICTED}") + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_REFSEQ_PEPTIDE}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_REFSEQ_PEPTIDE_PREDICTED}") + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_ENTREZGENE}") + check_row_count(mock_xref_dbi, "xref", 5, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_WIKIGENE}") + check_row_count(mock_xref_dbi, "dependent_xref", 16) + check_row_count(mock_xref_dbi, "primary_xref", 18) + + # Check the link between an xref and dependent_xref + master_xref_id = mock_xref_dbi.execute(text(f"SELECT xref_id FROM xref WHERE accession='NR_168385' AND source_id={SOURCE_ID_REFSEQ_NCRNA}")).scalar() + check_dependent_xref_link(mock_xref_dbi, "105373289", master_xref_id) + master_xref_id = mock_xref_dbi.execute(text(f"SELECT xref_id FROM xref WHERE accession='NP_001229259' AND source_id={SOURCE_ID_REFSEQ_PEPTIDE}")).scalar() + check_dependent_xref_link(mock_xref_dbi, "728393", master_xref_id) + master_xref_id = mock_xref_dbi.execute(text(f"SELECT xref_id FROM xref WHERE accession='NM_001242328' AND source_id={SOURCE_ID_REFSEQ_MRNA}")).scalar() + 
check_dependent_xref_link(mock_xref_dbi, "728393", master_xref_id) + + # Check the sequences for specific accessions + check_sequence(mock_xref_dbi, "NM_039939", SOURCE_ID_REFSEQ_MRNA, "taaatgtcttactgcttttactgttccctcctagagtccattctttactctaggagggaatagtaaaagcagtaagacattta") + check_sequence(mock_xref_dbi, "NP_001355183", SOURCE_ID_REFSEQ_PEPTIDE, "mllmvvsmacvglflvqragphmggqdkpflsawpsavvprgghvtlrchyrhrfnnfmlykedrihvpifhgrifqegfnmspvttahagnytcrgshphsptgwsapsnpmvimvtgnhrwcsnkkkcccngpracreqk") + + # Check the release info + check_release(mock_xref_dbi, SOURCE_ID_REFSEQ_MRNA, "NCBI Reference Sequence (RefSeq) Database Release 224, May 6, 2024") \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_rfam_parser.py b/src/python/test/xrefs/parsers/test_rfam_parser.py new file mode 100644 index 000000000..86caa9669 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_rfam_parser.py @@ -0,0 +1,130 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.RFAMParser import RFAMParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link + +# Constants +SOURCE_ID_RFAM = 1 +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" + +# Fixture to create an RFAMParser instance +@pytest.fixture +def rfam_parser() -> RFAMParser: + return RFAMParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(rfam_parser: RFAMParser, mock_xref_dbi: DBConnection, expected_xrefs: int, expected_direct_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = rfam_parser.run( + { + "source_id": SOURCE_ID_RFAM, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "file": "parsers/flatfiles/rfam.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing RFAM data" + assert ( + f"Added {expected_xrefs} RFAM xrefs and {expected_direct_xrefs} direct xrefs" in result_message + ), f"{prefix}Expected 'Added {expected_xrefs} RFAM xrefs and {expected_direct_xrefs} direct xrefs' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_rfam_no_source_id(rfam_parser: RFAMParser, test_no_source_id: Callable[[RFAMParser, int], None]) -> None: + test_no_source_id(rfam_parser, SPECIES_ID_HUMAN) + +def test_rfam_no_species_id(rfam_parser: RFAMParser, test_no_species_id: Callable[[RFAMParser, int], None]) -> None: + test_no_species_id(rfam_parser, SOURCE_ID_RFAM) + +def test_rfam_no_file(rfam_parser: RFAMParser, test_no_file: Callable[[RFAMParser, int, int], None]) -> None: + test_no_file(rfam_parser, SOURCE_ID_RFAM, SPECIES_ID_HUMAN) + +# Test case to check if parsing is skipped when no species name can be found +def test_no_species_name(mock_xref_dbi: DBConnection, rfam_parser: RFAMParser) -> None: + result_code, result_message = rfam_parser.run( + { + "source_id": SOURCE_ID_RFAM, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"Errors when parsing RFAM data" + assert ( + "Skipped. Could not find species ID to name mapping" in result_message + ), f"Expected 'Skipped. 
Could not find species ID to name mapping' in result_message, but got: '{result_message}'" + +# Test case to check if an error is raised when no RFAM database is provided +def test_no_rfam_db(rfam_parser: RFAMParser) -> None: + rfam_parser.get_db_from_registry = MagicMock(return_value=None) + + with pytest.raises( + AttributeError, match="Could not find RFAM DB." + ): + rfam_parser.run( + { + "source_id": SOURCE_ID_RFAM, + "species_id": SPECIES_ID_HUMAN, + "species_name": SPECIES_NAME_HUMAN, + "file": "dummy_file.txt", + "ensembl_release": "100", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check if an error is raised when the file is not found +def test_rfam_file_not_found(rfam_parser: RFAMParser, test_file_not_found: Callable[[RFAMParser, int, int], None]) -> None: + rfam_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_HUMAN: [SPECIES_NAME_HUMAN]}) + rfam_parser.get_rfam_db_url = MagicMock(return_value="mock_rfam_db_url") + rfam_parser.get_rfam_transcript_stable_ids = MagicMock(return_value={}) + test_file_not_found(rfam_parser, SOURCE_ID_RFAM, SPECIES_ID_HUMAN) + +# Test case to check successful parsing of valid RFAM data without existing RFAM xrefs in RFAM DB +def test_successful_parsing_without_existing_rfam_data(mock_xref_dbi: DBConnection, rfam_parser: RFAMParser) -> None: + rfam_parser.get_rfam_db_url = MagicMock(return_value="mock_rfam_db_url") + rfam_parser.get_rfam_transcript_stable_ids = MagicMock(return_value={}) + + # Run and validate parsing for RFAM file + run_and_validate_parsing(rfam_parser, mock_xref_dbi, 0, 0) + + # Check the row counts in the xref and transcript_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 0) + check_row_count(mock_xref_dbi, "transcript_direct_xref", 0) + +# Test case to check successful parsing of valid RFAM data with existing RFAM xrefs in RFAM DB +def test_successful_parsing_with_existing_rfam_data(mock_xref_dbi: DBConnection, rfam_parser: RFAMParser) -> None: + # Mock existing RFAM-to-transcript mappings in the RFAM DB + rfam_parser.get_rfam_db_url = MagicMock(return_value="mock_rfam_db_url") + rfam_parser.get_rfam_transcript_stable_ids = MagicMock(return_value={ + "RF00001": ["ENST00000516887", "ENST00000516971", "ENST00000622298", "ENST00000674448"], + "RF00002": ["ENST00000363564", "ENST00000515896"], + "RF00003": ["ENST00000353977"], + "RF00006": ["ENST00000362552", "ENST00000363120", "ENST00000365241", "ENST00000365645", "ENST00000516091"] + }) + + # Run and validate parsing for RFAM file + run_and_validate_parsing(rfam_parser, mock_xref_dbi, 4, 12) + + # Check the row counts in the xref and transcript_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 4) + check_row_count(mock_xref_dbi, "transcript_direct_xref", 12) + + # Check the link between an xref and transcript_direct_xref + check_direct_xref_link(mock_xref_dbi, "transcript", "RF00002", "ENST00000515896") + check_direct_xref_link(mock_xref_dbi, "transcript", "RF00006", "ENST00000362552") + check_direct_xref_link(mock_xref_dbi, "transcript", "RF00006", "ENST00000365645") + + # Run and validate re-parsing for RFAM file + run_and_validate_parsing(rfam_parser, mock_xref_dbi, 4, 12, "Re-parsing: ") + + # Check the row counts in the xref and transcript_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 4) + check_row_count(mock_xref_dbi, "transcript_direct_xref", 12) \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_rgd_parser.py b/src/python/test/xrefs/parsers/test_rgd_parser.py new file mode 100644 index 
000000000..2b8019c3f --- /dev/null +++ b/src/python/test/xrefs/parsers/test_rgd_parser.py @@ -0,0 +1,126 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.RGDParser import RGDParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link, check_dependent_xref_link, check_synonym + +# Constants +SOURCE_ID_RGD = 1 +SOURCE_ID_DIRECT = 2 +SPECIES_ID_RAT = 10116 + +# Fixture to create an RGDParser instance +@pytest.fixture +def rgd_parser() -> RGDParser: + return RGDParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(rgd_parser: RGDParser, mock_xref_dbi: DBConnection, expected_dependent_xrefs: int, expected_direct_xrefs: int, expected_mismatch: int, expected_synonyms: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = rgd_parser.run( + { + "source_id": SOURCE_ID_RGD, + "species_id": SPECIES_ID_RAT, + "file": "parsers/flatfiles/rgd.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing RGD data" + assert ( + f"{expected_dependent_xrefs} xrefs successfully loaded and dependent on refseq" in result_message + ), f"{prefix}Expected '{expected_dependent_xrefs} xrefs successfully loaded and dependent on refseq' in result_message, but got: '{result_message}'" + assert ( + f"{expected_mismatch} xrefs added but with NO dependencies" in result_message + ), f"{prefix}Expected '{expected_mismatch} xrefs added but with NO dependencies' in result_message, but got: '{result_message}'" + assert ( + f"{expected_direct_xrefs} direct xrefs successfully loaded" in result_message + ), f"{prefix}Expected '{expected_direct_xrefs} direct xrefs successfully loaded' in result_message, but got: '{result_message}'" + assert ( + f"Added {expected_synonyms} synonyms, including duplicates" in result_message + ), f"{prefix}Expected 'Added {expected_synonyms} synonyms, including duplicates' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_rgd_no_source_id(rgd_parser: RGDParser, test_no_source_id: Callable[[RGDParser, int], None]) -> None: + test_no_source_id(rgd_parser, SPECIES_ID_RAT) + +def test_rgd_no_species_id(rgd_parser: RGDParser, test_no_species_id: Callable[[RGDParser, int], None]) -> None: + test_no_species_id(rgd_parser, SOURCE_ID_RGD) + +def test_rgd_no_file(rgd_parser: RGDParser, test_no_file: Callable[[RGDParser, int, int], None]) -> None: + test_no_file(rgd_parser, SOURCE_ID_RGD, SPECIES_ID_RAT) + +# Test case to check if an error is raised when the file is not found +def test_rgd_file_not_found(rgd_parser: RGDParser, test_file_not_found: Callable[[RGDParser, int, int], None]) -> None: + rgd_parser.get_source_id_for_source_name = MagicMock(return_value=SOURCE_ID_DIRECT) + test_file_not_found(rgd_parser, SOURCE_ID_RGD, SPECIES_ID_RAT) + +# Test case to check if an error is raised when the file is empty +def test_rgd_empty_file(rgd_parser: RGDParser, test_empty_file: Callable[[RGDParser, str, int, int], None]) -> None: + rgd_parser.get_source_id_for_source_name = MagicMock(return_value=SOURCE_ID_DIRECT) + test_empty_file(rgd_parser, 'RGD', SOURCE_ID_RGD, SPECIES_ID_RAT) + +# Test case to check if an error is raised when the required source_id is missing +def test_rgd_missing_required_source_id(rgd_parser: RGDParser, mock_xref_dbi: 
DBConnection, test_missing_required_source_id: Callable[[RGDParser, DBConnection, str, int, int, str], None]) -> None: + test_missing_required_source_id(rgd_parser, mock_xref_dbi, 'RGD', SOURCE_ID_RGD, SPECIES_ID_RAT) + +# Test case to check successful parsing of valid RGD data without existing refseqs +def test_successful_parsing_without_refseqs(mock_xref_dbi: DBConnection, rgd_parser: RGDParser) -> None: + rgd_parser.get_source_id_for_source_name = MagicMock(return_value=SOURCE_ID_DIRECT) + + # Run and validate parsing for RGD file without existing refseqs + run_and_validate_parsing(rgd_parser, mock_xref_dbi, 0, 5, 2, 6) + + # Check the row counts in the xref, gene_direct_xref, dependent_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DIRECT' AND source_id={SOURCE_ID_DIRECT}") + check_row_count(mock_xref_dbi, "xref", 0, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_RGD}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='MISC' AND source_id={SOURCE_ID_RGD}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 5) + check_row_count(mock_xref_dbi, "dependent_xref", 0) + check_row_count(mock_xref_dbi, "synonym", 4) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "2004", "ENSRNOG00000028896") + +# Test case to check successful parsing of valid RGD data with refseqs +def test_successful_parsing_with_refseqs(mock_xref_dbi: DBConnection, rgd_parser: RGDParser) -> None: + rgd_parser.get_source_id_for_source_name = MagicMock(return_value=SOURCE_ID_DIRECT) + rgd_parser.get_valid_codes = MagicMock(return_value={"NM_052979": [12, 34], "XM_039101774" : [56], "XM_063281326": [78]}) + + # Run and validate parsing for RGD file with existing refseqs + run_and_validate_parsing(rgd_parser, mock_xref_dbi, 3, 5, 1, 12) + + # Check the row counts in the xref, gene_direct_xref, dependent_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DIRECT' AND source_id={SOURCE_ID_DIRECT}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_RGD}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='MISC' AND source_id={SOURCE_ID_RGD}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 5) + check_row_count(mock_xref_dbi, "dependent_xref", 3) + check_row_count(mock_xref_dbi, "synonym", 8) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "2012", "ENSRNOG00000009845") + + # Check the link between an xref and dependent_xref + check_dependent_xref_link(mock_xref_dbi, "2003", 12) + check_dependent_xref_link(mock_xref_dbi, "2003", 34) + check_dependent_xref_link(mock_xref_dbi, "2007", 56) + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "2003", SOURCE_ID_DIRECT, "ASP") + check_synonym(mock_xref_dbi, "2007", SOURCE_ID_RGD, "PMP70, 70-kDa peroxisomal membrane protein") + + # Run and validate re-parsing for RGD file + run_and_validate_parsing(rgd_parser, mock_xref_dbi, 3, 5, 1, 12, "Re-parsing: ") + + # Check the row counts in the xref, gene_direct_xref, dependent_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DIRECT' AND source_id={SOURCE_ID_DIRECT}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_RGD}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='MISC' AND source_id={SOURCE_ID_RGD}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 5) + check_row_count(mock_xref_dbi, 
"dependent_xref", 3) + check_row_count(mock_xref_dbi, "synonym", 8) diff --git a/src/python/test/xrefs/parsers/test_ucsc_parser.py b/src/python/test/xrefs/parsers/test_ucsc_parser.py new file mode 100644 index 000000000..ae96e4d3f --- /dev/null +++ b/src/python/test/xrefs/parsers/test_ucsc_parser.py @@ -0,0 +1,89 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.UCSCParser import UCSCParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count + +# Constants +SOURCE_ID_UCSC = 1 +SPECIES_ID_HUMAN = 9606 + +# Fixture to create a UCSCParser instance +@pytest.fixture +def ucsc_parser() -> UCSCParser: + return UCSCParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(ucsc_parser: UCSCParser, mock_xref_dbi: DBConnection, expected_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = ucsc_parser.run( + { + "source_id": SOURCE_ID_UCSC, + "species_id": SPECIES_ID_HUMAN, + "file": "parsers/flatfiles/ucsc.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing UCSC data" + assert ( + f"Loaded a total of {expected_xrefs} UCSC xrefs" in result_message + ), f"{prefix}Expected 'Loaded a total of {expected_xrefs} UCSC xrefs' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_ucsc_no_source_id(ucsc_parser: UCSCParser, test_no_source_id: Callable[[UCSCParser, int], None]) -> None: + test_no_source_id(ucsc_parser, SPECIES_ID_HUMAN) + +def test_ucsc_no_species_id(ucsc_parser: UCSCParser, test_no_species_id: Callable[[UCSCParser, int], None]) -> None: + test_no_species_id(ucsc_parser, SOURCE_ID_UCSC) + +def test_ucsc_no_file(ucsc_parser: UCSCParser, test_no_file: Callable[[UCSCParser, int, int], None]) -> None: + test_no_file(ucsc_parser, SOURCE_ID_UCSC, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_ucsc_file_not_found(ucsc_parser: UCSCParser, test_file_not_found: Callable[[UCSCParser, int, int], None]) -> None: + test_file_not_found(ucsc_parser, SOURCE_ID_UCSC, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is empty +def test_ucsc_empty_file(ucsc_parser: UCSCParser, test_empty_file: Callable[[UCSCParser, str, int, int], None]) -> None: + test_empty_file(ucsc_parser, 'UCSC', SOURCE_ID_UCSC, SPECIES_ID_HUMAN) + +# Parametrized test case to check if an error is raised for various missing keys +@pytest.mark.parametrize( + "line", [ + ("ENST00000619216.1\tchr1\t-\t17368\t17436\t17368\t17368\t1\t17368,\t17436,\t\t\n"), + ("ENST00000619216.1\t \t-\t17368\t17436\t17368\t17368\t1\t17368,\t17436,\t\tuc031tla.1\n"), + ("ENST00000619216.1\tchr1\t\t17368\t17436\t17368\t17368\t1\t17368,\t17436,\t\tuc031tla.1\n"), + ("ENST00000619216.1\tchr1\t-\t\t17436\t17368\t17368\t1\t17368,\t17436,\t\tuc031tla.1\n"), + ("ENST00000619216.1\tchr1\t-\t17368\t\t17368\t17368\t1\t17368,\t17436,\t\tuc031tla.1\n"), + ("ENST00000619216.1\tchr1\t-\t17368\t17436\t17368\t17368\t1\t\t17436,\t\tuc031tla.1\n"), + ("ENST00000619216.1\tchr1\t-\t17368\t17436\t17368\t17368\t1\t17368,\t \t\tuc031tla.1\n"), + ], + ids=["accession column", "chromosome column", "strand column", "txStart column", "txEnd column", "exonStarts column", "exonEnds column"], +) +def test_missing_keys(ucsc_parser: UCSCParser, line: 
str) -> None: + mock_file = io.StringIO(line) + ucsc_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Missing required key for xref"): + ucsc_parser.run( + { + "source_id": SOURCE_ID_UCSC, + "species_id": SPECIES_ID_HUMAN, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + ) + +# Test case to check successful parsing of valid UCSC data +def test_successful_parsing(mock_xref_dbi: DBConnection, ucsc_parser: UCSCParser) -> None: + # Run and validate parsing for UCSC file + run_and_validate_parsing(ucsc_parser, mock_xref_dbi, 10) + + # Check the row counts in the coordinate_xref table + check_row_count(mock_xref_dbi, "coordinate_xref", 10) \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_uniprot_parser.py b/src/python/test/xrefs/parsers/test_uniprot_parser.py new file mode 100644 index 000000000..0cf0e2cc7 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_uniprot_parser.py @@ -0,0 +1,181 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable, Dict +from sqlalchemy import text + +from ensembl.production.xrefs.parsers.UniProtParser import UniProtParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_synonym, check_direct_xref_link, check_dependent_xref_link, check_sequence, check_release + +# Constants +SOURCE_ID_UNIPROT = 1 +SOURCE_ID_SWISSPROT = 2 +SOURCE_ID_TREMBL = 3 +SOURCE_ID_TREMBL_NON_DISPLAY = 4 +SOURCE_ID_SWISSPROT_DIRECT = 5 +SOURCE_ID_TREMBL_DIRECT = 6 +SOURCE_ID_ISOFORM = 7 +SOURCE_ID_PDB = 8 +SOURCE_ID_STRING = 9 +SOURCE_ID_EMBL = 10 +SOURCE_ID_BIOGRID = 11 +SOURCE_ID_CHEMBL = 12 +SOURCE_ID_UNIPROT_GN = 13 +SOURCE_ID_PROTEIN_ID = 14 +SPECIES_ID_HUMAN = 9606 +SPECIES_NAME_HUMAN = "homo_sapiens" + +# Fixture to create a UniProtParser instance +@pytest.fixture +def uniprot_parser() -> UniProtParser: + return UniProtParser(True) + +# Function to populate the database with sources +def populate_xref_db(mock_xref_dbi: DBConnection): + source_data = [ + [SOURCE_ID_SWISSPROT, 'Uniprot/SWISSPROT', 10, 'sequence_mapped'], + [SOURCE_ID_TREMBL, 'Uniprot/SPTREMBL', 10, 'sequence_mapped'], + [SOURCE_ID_TREMBL_NON_DISPLAY, 'Uniprot/SPTREMBL', 10, 'protein_evidence_gt_2'], + [SOURCE_ID_SWISSPROT_DIRECT, 'Uniprot/SWISSPROT', 10, 'direct'], + [SOURCE_ID_TREMBL_DIRECT, 'Uniprot/SPTREMBL', 10, 'direct'], + [SOURCE_ID_ISOFORM, 'Uniprot_isoform', 10, ''], + [SOURCE_ID_PDB, 'PDB', 10, ''], + [SOURCE_ID_STRING, 'STRING', 10, ''], + [SOURCE_ID_EMBL, 'EMBL', 10, ''], + [SOURCE_ID_BIOGRID, 'BioGRID', 10, ''], + [SOURCE_ID_CHEMBL, 'ChEMBL', 10, ''], + [SOURCE_ID_UNIPROT_GN, 'Uniprot_gn', 10, ''], + [SOURCE_ID_PROTEIN_ID, 'protein_id', 10, ''], + ] + for row in source_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO source (source_id, name, ordered, priority_description) + VALUES (:source_id, :name, :ordered, :priority_description) + """ + ), + { + "source_id": row[0], + "name": row[1], + "ordered": row[2], + "priority_description": row[3], + } + ) + +# Function to run and validate the parsing process +def run_and_validate_parsing(uniprot_parser: UniProtParser, mock_xref_dbi: DBConnection, file:str, expected_xrefs: Dict[str, int], expected_deps: Dict[str, int], prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = uniprot_parser.run( + { + "source_id": SOURCE_ID_UNIPROT, + "species_id": SPECIES_ID_HUMAN, + "file": f"parsers/flatfiles/{file}.txt", + "rel_file": 
"parsers/flatfiles/uniprot_release.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + sp = expected_xrefs["num_sp"] + sptr = expected_xrefs["num_sptr"] + sptr_non_display = expected_xrefs["num_sptr_non_display"] + direct_sp = expected_xrefs["num_direct_sp"] + direct_sptr = expected_xrefs["num_direct_sptr"] + isoform = expected_xrefs["num_isoform"] + skipped = expected_xrefs["num_skipped"] + + assert result_code == 0, f"{prefix}Errors when parsing UniProt data" + assert ( + f"Read {sp} SwissProt xrefs, {sptr} SPTrEMBL xrefs with protein evidence codes 1-2," in result_message + ), f"{prefix}Expected 'Read {sp} SwissProt xrefs, {sptr} SPTrEMBL xrefs with protein evidence codes 1-2,' in result_message, but got: '{result_message}'" + assert ( + f"and {sptr_non_display} SPTrEMBL xrefs with protein evidence codes > 2 from" in result_message + ), f"{prefix}Expected 'and {sptr_non_display} SPTrEMBL xrefs with protein evidence codes > 2 from' in result_message, but got: '{result_message}'" + assert ( + f"Added {direct_sp} direct SwissProt xrefs and {direct_sptr} direct SPTrEMBL xrefs" in result_message + ), f"{prefix}Expected 'Added {direct_sp} direct SwissProt xrefs and {direct_sptr} direct SPTrEMBL xrefs' in result_message, but got: '{result_message}'" + assert ( + f"Added {isoform} direct isoform xrefs" in result_message + ), f"{prefix}Expected 'Added {isoform} direct isoform xrefs' in result_message, but got: '{result_message}'" + assert ( + f"Skipped {skipped} ensembl annotations as Gene names" in result_message + ), f"{prefix}Expected 'Skipped {skipped} ensembl annotations as Gene names' in result_message, but got: '{result_message}'" + + for count_type, count in expected_deps.items(): + assert f"{count_type}\t{count}" in result_message, f"{prefix}Expected '{count_type}\t{count}' in result_meesgae, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_uniprot_no_source_id(uniprot_parser: UniProtParser, test_no_source_id: Callable[[UniProtParser, int], None]) -> None: + test_no_source_id(uniprot_parser, SPECIES_ID_HUMAN) + +def test_uniprot_no_species_id(uniprot_parser: UniProtParser, test_no_species_id: Callable[[UniProtParser, int], None]) -> None: + test_no_species_id(uniprot_parser, SOURCE_ID_UNIPROT) + +def test_uniprot_no_file(uniprot_parser: UniProtParser, test_no_file: Callable[[UniProtParser, int, int], None]) -> None: + test_no_file(uniprot_parser, SOURCE_ID_UNIPROT, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the required source_id is missing +def test_uniprot_missing_required_source_id(uniprot_parser: UniProtParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[UniProtParser, DBConnection, str, int, int, str], None]) -> None: + test_missing_required_source_id(uniprot_parser, mock_xref_dbi, 'Uniprot/SWISSPROT', SOURCE_ID_SWISSPROT, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the file is not found +def test_uniprot_file_not_found(uniprot_parser: UniProtParser, test_file_not_found: Callable[[UniProtParser, int, int], None]) -> None: + test_file_not_found(uniprot_parser, SOURCE_ID_UNIPROT, SPECIES_ID_HUMAN) + +# Test case to check successful parsing of valid UniProt data +def test_successful_parsing(mock_xref_dbi: DBConnection, uniprot_parser: UniProtParser) -> None: + populate_xref_db(mock_xref_dbi) + + # Run and validate parsing for UniProt SWISSPROT file + expected_counts = {"num_sp": 4, "num_sptr": 0, "num_sptr_non_display": 0, 
"num_direct_sp": 8, "num_direct_sptr": 0, "num_isoform": 6, "num_skipped": 1} + expected_deps = {"PDB": 50, "STRING": 4, "EMBL": 34, "BioGRID": 4, "ChEMBL": 4, "protein_id": 34, "Uniprot_gn": 3} + run_and_validate_parsing(uniprot_parser, mock_xref_dbi, "uniprot_swissprot", expected_counts, expected_deps) + + # Run and validate parsing for UniProt TREMBL file + expected_counts = {"num_sp": 0, "num_sptr": 1, "num_sptr_non_display": 8, "num_direct_sp": 0, "num_direct_sptr": 0, "num_isoform": 0, "num_skipped": 0} + expected_deps = {"EMBL": 49, "protein_id": 49, "Uniprot_gn": 7} + run_and_validate_parsing(uniprot_parser, mock_xref_dbi, "uniprot_trembl", expected_counts, expected_deps) + + # Check the row counts in the xref, translation_direct_xref, dependent_xref, primary_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_SWISSPROT}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DIRECT' AND source_id={SOURCE_ID_SWISSPROT_DIRECT}") + check_row_count(mock_xref_dbi, "xref", 1, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_TREMBL}") + check_row_count(mock_xref_dbi, "xref", 0, f"info_type='DIRECT' AND source_id={SOURCE_ID_TREMBL_DIRECT}") + check_row_count(mock_xref_dbi, "xref", 8, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_TREMBL_NON_DISPLAY}") + check_row_count(mock_xref_dbi, "xref", 49, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_PDB}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_STRING}") + check_row_count(mock_xref_dbi, "xref", 83, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_EMBL}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_BIOGRID}") + check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_CHEMBL}") + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_UNIPROT_GN}") + check_row_count(mock_xref_dbi, "xref", 83, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_PROTEIN_ID}") + check_row_count(mock_xref_dbi, "translation_direct_xref", 14) + check_row_count(mock_xref_dbi, "dependent_xref", 238) + check_row_count(mock_xref_dbi, "primary_xref", 13) + check_row_count(mock_xref_dbi, "synonym", 16) + + # Check the link between an xref and translation_direct_xref + check_direct_xref_link(mock_xref_dbi, "translation", "P62258", "ENSP00000461762") + check_direct_xref_link(mock_xref_dbi, "translation", "P31946-1", "ENSP00000361930") + + # Check the link between an xref and dependent_xref + master_xref_id = mock_xref_dbi.execute(text(f"SELECT xref_id FROM xref WHERE accession='Q4F4R7' AND source_id={SOURCE_ID_TREMBL_NON_DISPLAY}")).scalar() + check_dependent_xref_link(mock_xref_dbi, "DQ305032", master_xref_id) + check_dependent_xref_link(mock_xref_dbi, "AGQ46203", master_xref_id) + master_xref_id = mock_xref_dbi.execute(text(f"SELECT xref_id FROM xref WHERE accession='P62258' AND source_id={SOURCE_ID_SWISSPROT}")).scalar() + check_dependent_xref_link(mock_xref_dbi, "6EIH", master_xref_id) + + # Check the sequences for specific accessions + check_sequence(mock_xref_dbi, "Q04917", SOURCE_ID_SWISSPROT, "MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSWEAGEGN") + check_sequence(mock_xref_dbi, "A0A7D5YZ42", SOURCE_ID_TREMBL_NON_DISPLAY, "LSKVYGPVFTLYFGLKPIVVLHGYEAVKEALIDLGEEFSGRGIFPLAERANRGFGIVFSNGKKWKEIRHFSLMTLRNFGMGKRSIEDRVQEEARCLVEELRKTKGG") + + # Check the synonyms for specific accessions + 
check_synonym(mock_xref_dbi, "P62258", SOURCE_ID_UNIPROT_GN, "YWHAE1") + check_synonym(mock_xref_dbi, "P61981", SOURCE_ID_SWISSPROT, "P35214") + check_synonym(mock_xref_dbi, "P61981", SOURCE_ID_SWISSPROT, "Q9UDP2") + + # Check the release info + check_release(mock_xref_dbi, SOURCE_ID_SWISSPROT, "UniProtKB/Swiss-Prot Release 2024_03 of 29-May-2024") + check_release(mock_xref_dbi, SOURCE_ID_TREMBL, "UniProtKB/TrEMBL Release 2024_03 of 29-May-2024") \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_vgnc_parser.py b/src/python/test/xrefs/parsers/test_vgnc_parser.py new file mode 100644 index 000000000..6ebe58d8d --- /dev/null +++ b/src/python/test/xrefs/parsers/test_vgnc_parser.py @@ -0,0 +1,96 @@ +import pytest +import io +from unittest.mock import MagicMock +from typing import Callable + +from ensembl.production.xrefs.parsers.VGNCParser import VGNCParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link, check_synonym + +# Constants +SOURCE_ID_VGNC = 1 +SPECIES_ID_P_TROGLODYTES = 9598 + +# Fixture to create a VGNCParser instance +@pytest.fixture +def vgnc_parser() -> VGNCParser: + return VGNCParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(vgnc_parser: VGNCParser, mock_xref_dbi: DBConnection, expected_xrefs: int, expected_synonyms: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = vgnc_parser.run( + { + "source_id": SOURCE_ID_VGNC, + "species_id": SPECIES_ID_P_TROGLODYTES, + "file": "parsers/flatfiles/vgnc.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing VGNC data" + assert ( + f"Loaded a total of {expected_xrefs} VGNC xrefs and added {expected_synonyms} synonyms" in result_message + ), f"{prefix}Expected 'Loaded a total of {expected_xrefs} VGNC xrefs and added {expected_synonyms} synonyms' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_vgnc_no_source_id(vgnc_parser: VGNCParser, test_no_source_id: Callable[[VGNCParser, int], None]) -> None: + test_no_source_id(vgnc_parser, SPECIES_ID_P_TROGLODYTES) + +def test_vgnc_no_species_id(vgnc_parser: VGNCParser, test_no_species_id: Callable[[VGNCParser, int], None]) -> None: + test_no_species_id(vgnc_parser, SOURCE_ID_VGNC) + +def test_vgnc_no_file(vgnc_parser: VGNCParser, test_no_file: Callable[[VGNCParser, int, int], None]) -> None: + test_no_file(vgnc_parser, SOURCE_ID_VGNC, SPECIES_ID_P_TROGLODYTES) + +# Test case to check if an error is raised when the file is not found +def test_vgnc_file_not_found(vgnc_parser: VGNCParser, test_file_not_found: Callable[[VGNCParser, int, int], None]) -> None: + test_file_not_found(vgnc_parser, SOURCE_ID_VGNC, SPECIES_ID_P_TROGLODYTES) + +# Test case to check if an error is raised when the file is empty +def test_vgnc_empty_file(vgnc_parser: VGNCParser, test_empty_file: Callable[[VGNCParser, str, int, int], None]) -> None: + test_empty_file(vgnc_parser, 'VGNC', SOURCE_ID_VGNC, SPECIES_ID_P_TROGLODYTES) + +# Test case to check if an error is raised when required columns are missing +def test_missing_columns(vgnc_parser: VGNCParser, mock_xref_dbi: DBConnection) -> None: + mock_file = 
io.StringIO("taxon_id\tvgnc_id\tsymbol\tname\tlocus_group\tlocus_type\tstatus\tlocation\tlocation_sortable:\talias_symbol\talias_name\tprev_symbol\tprev_name\tgene_family\tgene_family_id\tdate_approved_reserved\tdate_symbol_changed\tdate_name_changed\tdate_modified\tentrez_id\tuniprot_ids\n") + vgnc_parser.get_filehandle = MagicMock(return_value=mock_file) + + with pytest.raises(ValueError, match="Can't find required columns in VGNC file"): + vgnc_parser.run( + { + "source_id": SOURCE_ID_VGNC, + "species_id": SPECIES_ID_P_TROGLODYTES, + "file": "dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + +# Test case to check successful parsing of valid VGNC data +def test_successful_parsing(mock_xref_dbi: DBConnection, vgnc_parser: VGNCParser) -> None: + vgnc_parser.species_id_to_taxonomy = MagicMock(return_value={}) + + # Run and validate parsing for VGNC file + run_and_validate_parsing(vgnc_parser, mock_xref_dbi, 6, 2) + + # Check the row counts in the xref, gene_direct_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_VGNC}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 6) + check_row_count(mock_xref_dbi, "synonym", 2) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "VGNC:14660", "ENSPTRG00000013870") + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "VGNC:14659", SOURCE_ID_VGNC, "TEST_SYNONYM") + check_synonym(mock_xref_dbi, "VGNC:3738", SOURCE_ID_VGNC, "DIP2") + + # Run and validate re-parsing for VGNC file + run_and_validate_parsing(vgnc_parser, mock_xref_dbi, 6, 2) + + # Check the row counts in the xref, gene_direct_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_VGNC}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 6) + check_row_count(mock_xref_dbi, "synonym", 2) diff --git a/src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py b/src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py new file mode 100644 index 000000000..dda0c7bdc --- /dev/null +++ b/src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py @@ -0,0 +1,78 @@ +import pytest +from typing import Callable + +from ensembl.production.xrefs.parsers.XenopusJamboreeParser import XenopusJamboreeParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link, check_description + +# Constants +SOURCE_ID_XENOPUS_JAMBOREE = 1 +SPECIES_ID_XENOPUS = 8364 + +# Fixture to create a XenopusJamboreeParser instance +@pytest.fixture +def xenopus_jamboree_parser() -> XenopusJamboreeParser: + return XenopusJamboreeParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(xenopus_jamboree_parser: XenopusJamboreeParser, mock_xref_dbi: DBConnection, expected_xrefs: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = xenopus_jamboree_parser.run( + { + "source_id": SOURCE_ID_XENOPUS_JAMBOREE, + "species_id": SPECIES_ID_XENOPUS, + "file": "parsers/flatfiles/xenopus_jamboree.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing Xenopus Jamboree data" + assert ( + f"{expected_xrefs} XenopusJamboree xrefs successfully parsed" in result_message + ), f"{prefix}Expected '{expected_xrefs} XenopusJamboree xrefs successfully parsed' in result_message, but got: '{result_message}'" + +# Test cases to check if 
mandatory parser arguments are passed: source_id, species_id, and file +def test_xenopus_jamboree_no_source_id(xenopus_jamboree_parser: XenopusJamboreeParser, test_no_source_id: Callable[[XenopusJamboreeParser, int], None]) -> None: + test_no_source_id(xenopus_jamboree_parser, SPECIES_ID_XENOPUS) + +def test_xenopus_jamboree_no_species_id(xenopus_jamboree_parser: XenopusJamboreeParser, test_no_species_id: Callable[[XenopusJamboreeParser, int], None]) -> None: + test_no_species_id(xenopus_jamboree_parser, SOURCE_ID_XENOPUS_JAMBOREE) + +def test_xenopus_jamboree_no_file(xenopus_jamboree_parser: XenopusJamboreeParser, test_no_file: Callable[[XenopusJamboreeParser, int, int], None]) -> None: + test_no_file(xenopus_jamboree_parser, SOURCE_ID_XENOPUS_JAMBOREE, SPECIES_ID_XENOPUS) + +# Test case to check if an error is raised when the file is not found +def test_xenopus_jamboree_file_not_found(xenopus_jamboree_parser: XenopusJamboreeParser, test_file_not_found: Callable[[XenopusJamboreeParser, int, int], None]) -> None: + test_file_not_found(xenopus_jamboree_parser, SOURCE_ID_XENOPUS_JAMBOREE, SPECIES_ID_XENOPUS) + +# Test case to check if an error is raised when the file is empty +def test_xenopus_jamboree_empty_file(xenopus_jamboree_parser: XenopusJamboreeParser, test_empty_file: Callable[[XenopusJamboreeParser, str, int, int], None]) -> None: + test_empty_file(xenopus_jamboree_parser, 'XenopusJamboree', SOURCE_ID_XENOPUS_JAMBOREE, SPECIES_ID_XENOPUS) + +# Test case to check successful parsing of valid Xenopus Jamboree data +def test_successful_parsing(mock_xref_dbi: DBConnection, xenopus_jamboree_parser: XenopusJamboreeParser) -> None: + # Run and validate parsing for Xenopus Jamboree file + run_and_validate_parsing(xenopus_jamboree_parser, mock_xref_dbi, 12) + + # Check the row counts in the xref and gene_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 12, f"info_type='DIRECT' AND source_id={SOURCE_ID_XENOPUS_JAMBOREE}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 12) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "XB-GENE-478064", "ENSXETG00000005286") + check_direct_xref_link(mock_xref_dbi, "gene", "XB-GENE-478141", "ENSXETG00000025664") + + # Check if provenance information correctly removed from descriptions + check_description(mock_xref_dbi, "XB-GENE-940866", "receptor (chemosensory) transporter protein 3 gene C") + + # Check if "X of Y" labels correctly removed from descriptions + check_description(mock_xref_dbi, "XB-GENE-981482", "conserved hypothetical olfactory receptor") + + # Run and validate re-parsing for Xenopus Jamboree file + run_and_validate_parsing(xenopus_jamboree_parser, mock_xref_dbi, 12, "Re-parsing: ") + + # Check the row counts in the xref and gene_direct_xref tables + check_row_count(mock_xref_dbi, "xref", 12, f"info_type='DIRECT' AND source_id={SOURCE_ID_XENOPUS_JAMBOREE}") + check_row_count(mock_xref_dbi, "gene_direct_xref", 12) \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_zfin_desc_parser.py b/src/python/test/xrefs/parsers/test_zfin_desc_parser.py new file mode 100644 index 000000000..1ef373c46 --- /dev/null +++ b/src/python/test/xrefs/parsers/test_zfin_desc_parser.py @@ -0,0 +1,63 @@ +import pytest +from typing import Callable + +from ensembl.production.xrefs.parsers.ZFINDescParser import ZFINDescParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count + +# Constants +SOURCE_ID_ZFIN = 1 +SPECIES_ID_ZEBRAFISH = 7955 
+ +# Fixture to create a ZFINDescParser instance +@pytest.fixture +def zfin_desc_parser() -> ZFINDescParser: + return ZFINDescParser(True) + +# Function to run and validate the parsing process +def run_and_validate_parsing(zfin_desc_parser: ZFINDescParser, mock_xref_dbi: DBConnection, expected_xrefs: int, expected_withdrawn: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = zfin_desc_parser.run( + { + "source_id": SOURCE_ID_ZFIN, + "species_id": SPECIES_ID_ZEBRAFISH, + "file": "parsers/flatfiles/zfin_desc.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing ZFINDesc data" + assert ( + f"{expected_xrefs} ZFINDesc xrefs added" in result_message + ), f"{prefix}Expected '{expected_xrefs} ZFINDesc xrefs added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_withdrawn} withdrawn entries ignored" in result_message + ), f"{prefix}Expected '{expected_withdrawn} withdrawn entries ignored' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_zfin_desc_no_source_id(zfin_desc_parser: ZFINDescParser, test_no_source_id: Callable[[ZFINDescParser, int], None]) -> None: + test_no_source_id(zfin_desc_parser, SPECIES_ID_ZEBRAFISH) + +def test_zfin_desc_no_species_id(zfin_desc_parser: ZFINDescParser, test_no_species_id: Callable[[ZFINDescParser, int], None]) -> None: + test_no_species_id(zfin_desc_parser, SOURCE_ID_ZFIN) + +def test_zfin_desc_no_file(zfin_desc_parser: ZFINDescParser, test_no_file: Callable[[ZFINDescParser, int, int], None]) -> None: + test_no_file(zfin_desc_parser, SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + +# Test case to check if an error is raised when the file is not found +def test_zfin_desc_file_not_found(zfin_desc_parser: ZFINDescParser, test_file_not_found: Callable[[ZFINDescParser, int, int], None]) -> None: + test_file_not_found(zfin_desc_parser, SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + +# Test case to check if an error is raised when the file is empty +def test_zfin_desc_empty_file(zfin_desc_parser: ZFINDescParser, test_empty_file: Callable[[ZFINDescParser, str, int, int], None]) -> None: + test_empty_file(zfin_desc_parser, 'ZFINDesc', SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + +# Test case to check successful parsing of valid ZFINDesc data +def test_successful_parsing(mock_xref_dbi: DBConnection, zfin_desc_parser: ZFINDescParser) -> None: + # Run and validate parsing for ZFINDesc file + run_and_validate_parsing(zfin_desc_parser, mock_xref_dbi, 6, 3) + + # Check the row counts in the xref table + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='MISC' AND source_id={SOURCE_ID_ZFIN}") \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_zfin_parser.py b/src/python/test/xrefs/parsers/test_zfin_parser.py new file mode 100644 index 000000000..060ffa2bc --- /dev/null +++ b/src/python/test/xrefs/parsers/test_zfin_parser.py @@ -0,0 +1,165 @@ +import pytest +from unittest.mock import MagicMock +from typing import Callable +from sqlalchemy import text + +from ensembl.production.xrefs.parsers.ZFINParser import ZFINParser +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_direct_xref_link, check_dependent_xref_link, check_synonym, check_description + +# Constants +SOURCE_ID_ZFIN = 1 +SOURCE_ID_DIRECT = 2 +SOURCE_ID_DEPENDENT = 3 +SOURCE_ID_DESCRIPTION = 4 +SOURCE_ID_UNIPROT = 5 +SOURCE_ID_REFSEQ = 6 
+SPECIES_ID_ZEBRAFISH = 7955 + +# Fixture to create a ZFINParser instance +@pytest.fixture +def zfin_parser() -> ZFINParser: + return ZFINParser(True) + +# Function to populate the database with ZFIN Desc, Uniprot, and RefSeq xrefs +def populate_xref_db(mock_xref_dbi: DBConnection): + source_data = [ + [SOURCE_ID_DESCRIPTION, 'ZFIN_ID', 10, 'description_only'], + [SOURCE_ID_DIRECT, 'ZFIN_ID', 1, 'direct'], + [SOURCE_ID_DEPENDENT, 'ZFIN_ID', 2, 'uniprot/refseq'], + [SOURCE_ID_UNIPROT, 'Uniprot/SWISSPROT', 20, ''], + [SOURCE_ID_REFSEQ, 'RefSeq_dna', 15, ''], + ] + for row in source_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO source (source_id, name, ordered, priority_description) + VALUES (:source_id, :name, :ordered, :priority_description) + """ + ), + { + "source_id": row[0], + "name": row[1], + "ordered": row[2], + "priority_description": row[3] + } + ) + + xref_data = [ + [1, 'ZDB-GENE-000125-4', SOURCE_ID_DESCRIPTION, SPECIES_ID_ZEBRAFISH, 'MISC', 'deltaC'], + [2, 'ZDB-GENE-000201-9', SOURCE_ID_DESCRIPTION, SPECIES_ID_ZEBRAFISH, 'MISC', 'anosmin 1a'], + [3, 'ZDB-GENE-000128-18', SOURCE_ID_DESCRIPTION, SPECIES_ID_ZEBRAFISH, 'MISC', 'anoctamin 1'], + [4, 'A0A8M9PP76', SOURCE_ID_UNIPROT, SPECIES_ID_ZEBRAFISH, 'SEQUENCE_MATCH', ''], + [5, 'B2GNV2', SOURCE_ID_UNIPROT, SPECIES_ID_ZEBRAFISH, 'SEQUENCE_MATCH', ''], + [6, 'Q9PTU1', SOURCE_ID_UNIPROT, SPECIES_ID_ZEBRAFISH, 'SEQUENCE_MATCH', ''], + [7, 'NP_571533', SOURCE_ID_REFSEQ, SPECIES_ID_ZEBRAFISH, 'SEQUENCE_MATCH', ''], + [8, 'NM_131458', SOURCE_ID_REFSEQ, SPECIES_ID_ZEBRAFISH, 'SEQUENCE_MATCH', ''], + ] + for row in xref_data: + mock_xref_dbi.execute( + text( + """ + INSERT INTO xref (xref_id, accession, source_id, species_id, info_type, description) + VALUES (:xref_id, :accession, :source_id, :species_id, :info_type, :description) + """ + ), + { + "xref_id": row[0], + "accession": row[1], + "source_id": row[2], + "species_id": row[3], + "info_type": row[4], + "description": row[5] + } + ) + + mock_xref_dbi.commit() + +# Function to run and validate the parsing process +def run_and_validate_parsing(zfin_parser: ZFINParser, mock_xref_dbi: DBConnection, expected_direct_xrefs: int, expected_uniprot_xrefs: int, expected_refseq_xref: int, expected_mismatch: int, expected_synonyms: int, prefix: str = None) -> None: + if prefix is None: + prefix = "" + + result_code, result_message = zfin_parser.run( + { + "source_id": SOURCE_ID_ZFIN, + "species_id": SPECIES_ID_ZEBRAFISH, + "file": "parsers/flatfiles/zfin/dummy_file.txt", + "xref_dbi": mock_xref_dbi, + } + ) + + assert result_code == 0, f"{prefix}Errors when parsing ZFIN data" + assert ( + f"{expected_direct_xrefs} direct ZFIN xrefs added and" in result_message + ), f"{prefix}Expected '{expected_direct_xrefs} direct ZFIN xrefs added and' in result_message, but got: '{result_message}'" + assert ( + f"{expected_uniprot_xrefs} dependent xrefs from UniProt added" in result_message + ), f"{prefix}Expected '{expected_uniprot_xrefs} dependent xrefs from UniProt added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_refseq_xref} dependent xrefs from RefSeq added" in result_message + ), f"{prefix}Expected '{expected_refseq_xref} dependent xrefs from RefSeq added' in result_message, but got: '{result_message}'" + assert ( + f"{expected_mismatch} dependents ignored" in result_message + ), f"{prefix}Expected '{expected_mismatch} dependents ignored' in result_message, but got: '{result_message}'" + assert ( + f"{expected_synonyms} synonyms loaded" in result_message + ), 
f"{prefix}Expected '{expected_synonyms} synonyms loaded' in result_message, but got: '{result_message}'" + +# Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file +def test_zfin_no_source_id(zfin_parser: ZFINParser, test_no_source_id: Callable[[ZFINParser, int], None]) -> None: + test_no_source_id(zfin_parser, SPECIES_ID_ZEBRAFISH) + +def test_zfin_no_species_id(zfin_parser: ZFINParser, test_no_species_id: Callable[[ZFINParser, int], None]) -> None: + test_no_species_id(zfin_parser, SOURCE_ID_ZFIN) + +def test_zfin_no_file(zfin_parser: ZFINParser, test_no_file: Callable[[ZFINParser, int, int], None]) -> None: + test_no_file(zfin_parser, SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + +# Test case to check if an error is raised when the required source_id is missing +def test_zfin_missing_required_source_id(zfin_parser: ZFINParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[ZFINParser, DBConnection, str, int, int, str], None]) -> None: + test_missing_required_source_id(zfin_parser, mock_xref_dbi, 'ZFIN_ID', SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH, 'direct') + +# Test case to check if an error is raised when the file is not found +def test_zfin_file_not_found(zfin_parser: ZFINParser, test_file_not_found: Callable[[ZFINParser, int, int], None]) -> None: + test_file_not_found(zfin_parser, SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + +# Test case to check if an error is raised when the file is empty +def test_dbass_empty_file(zfin_parser: ZFINParser, test_empty_file: Callable[[ZFINParser, str, int, int], None]) -> None: + test_empty_file(zfin_parser, 'ZFIN Ensembl', SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + +# Test case to check successful parsing +def test_successful_parsing(mock_xref_dbi: DBConnection, zfin_parser: ZFINParser) -> None: + populate_xref_db(mock_xref_dbi) + + # Check the row counts in the xref before running the parser + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='MISC' AND source_id={SOURCE_ID_DESCRIPTION}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_UNIPROT}") + check_row_count(mock_xref_dbi, "xref", 2, f"info_type='SEQUENCE_MATCH' AND source_id={SOURCE_ID_REFSEQ}") + + # Run and validate parsing for ZFIN files + run_and_validate_parsing(zfin_parser, mock_xref_dbi, 10, 3, 2, 9, 5) + + # Check the row counts in the xref, dependent_xref, and synonym tables + check_row_count(mock_xref_dbi, "xref", 10, f"info_type='DIRECT' AND source_id={SOURCE_ID_DIRECT}") + check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DEPENDENT' AND source_id={SOURCE_ID_DEPENDENT}") + check_row_count(mock_xref_dbi, "dependent_xref", 5) + check_row_count(mock_xref_dbi, "synonym", 7) + + # Check the link between an xref and gene_direct_xref + check_direct_xref_link(mock_xref_dbi, "gene", "ZDB-GENE-000125-4", "ENSDARG00000002336") + + # Check the link between an xref and dependent_xref + check_dependent_xref_link(mock_xref_dbi, "ZDB-GENE-000128-18", 5) + check_dependent_xref_link(mock_xref_dbi, "ZDB-GENE-000128-18", 6) + check_dependent_xref_link(mock_xref_dbi, "ZDB-GENE-000201-96", 7) + + # Check the synonyms for specific accessions + check_synonym(mock_xref_dbi, "ZDB-GENE-000125-12", SOURCE_ID_DIRECT, "Df(LG03)") + check_synonym(mock_xref_dbi, "ZDB-GENE-000128-18", SOURCE_ID_DEPENDENT, "Tg(NBT:MAPT-GFP)") + + # Check the descriptions for specific accessions + check_description(mock_xref_dbi, "ZDB-GENE-000125-4", "deltaC") + check_description(mock_xref_dbi, "ZDB-GENE-000201-9", "anosmin 
1a") + check_description(mock_xref_dbi, "ZDB-GENE-000128-18", "anoctamin 1") diff --git a/src/python/test/xrefs/pytest.ini b/src/python/test/xrefs/pytest.ini new file mode 100644 index 000000000..b79469489 --- /dev/null +++ b/src/python/test/xrefs/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = --verbose --tb=line \ No newline at end of file diff --git a/src/python/test/xrefs/test_helpers.py b/src/python/test/xrefs/test_helpers.py new file mode 100644 index 000000000..efe1e35d4 --- /dev/null +++ b/src/python/test/xrefs/test_helpers.py @@ -0,0 +1,80 @@ +from sqlalchemy import text + +from ensembl.utils.database import UnitTestDB, DBConnection + +# Helper function to check the row count in a specific table +def check_row_count(db: DBConnection, table: str, expected_count: int, where_clause: str = None) -> None: + sql = f"SELECT COUNT(*) FROM {table}" + if where_clause is not None: + sql += f" WHERE {where_clause}" + + row_count = db.execute(text(sql)).scalar() + assert ( + row_count == expected_count + ), f"Expected {expected_count} rows in {table} table (WHERE: {where_clause or ''}), but got {row_count}" + +# Helper function to check the synonym for a specific accession +def check_synonym(db: DBConnection, accession: str, source_id: int, expected_synonym: str) -> None: + synonym = db.execute( + text( + f"SELECT s.synonym FROM synonym s, xref x WHERE s.xref_id=x.xref_id AND x.accession='{accession}' AND x.source_id={source_id} AND s.synonym='{expected_synonym}'" + ) + ).scalar() + assert ( + synonym == expected_synonym + ), f"Expected synonym '{expected_synonym}' for accession '{accession}', but got '{synonym}'" + +# Helper function to check the direct xref connection for a specific accession +def check_direct_xref_link(db: DBConnection, type: str, accession: str, expected_stable_id: str) -> None: + stable_id = db.execute( + text( + f"SELECT d.ensembl_stable_id FROM {type}_direct_xref d, xref x WHERE d.general_xref_id=x.xref_id AND x.accession='{accession}' AND d.ensembl_stable_id='{expected_stable_id}'" + ) + ).scalar() + assert ( + stable_id == expected_stable_id + ), f"Expected link between accession '{accession}' and EnsEMBL stable ID '{expected_stable_id}', but got '{stable_id}'" + +# Helper function to check the dependent xref connection for a specific accession +def check_dependent_xref_link(db: DBConnection, accession: str, expected_master_xref_id: str) -> None: + master_xref_id = db.execute( + text( + f"SELECT d.master_xref_id FROM dependent_xref d, xref x WHERE d.dependent_xref_id=x.xref_id AND x.accession='{accession}' AND d.master_xref_id={expected_master_xref_id}" + ) + ).scalar() + assert ( + master_xref_id == expected_master_xref_id + ), f"Expected link between accession '{accession}' and master xref ID '{expected_master_xref_id}', but got '{master_xref_id}'" + +# Helper function to check the sequence for a specific accession +def check_sequence(db: DBConnection, accession: str, source_id: int, expected_sequence: str) -> None: + sequence = db.execute( + text( + f"SELECT p.sequence FROM primary_xref p, xref x WHERE p.xref_id=x.xref_id AND x.accession='{accession}' AND x.source_id={source_id}" + ) + ).scalar() + assert ( + sequence == expected_sequence + ), f"Expected sequence '{expected_sequence}' for accession '{accession}', but got '{sequence}'" + +# Helper function to check the description for a specific accession +def check_description(db: DBConnection, accession: str, expected_description: str) -> None: + description = db.execute( + text( + f"SELECT description FROM xref 
WHERE accession='{accession}'" + ) + ).scalar() + assert ( + description == expected_description + ), f"Expected description '{expected_description}' for accession '{accession}', but got '{description}'" + +# Helper function to check the release info for a specific source_id +def check_release(db: DBConnection, source_id: str, expected_release: str) -> None: + release = db.execute( + text( + f"SELECT source_release FROM source WHERE source_id={source_id}" + ) + ).scalar() + assert ( + release == expected_release + ), f"Expected release info '{expected_release}' for source_id {source_id}, but got '{release}'" \ No newline at end of file From 182f2513350478f81b6b019185ee5df23d5f0cc1 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Fri, 8 Nov 2024 13:26:39 +0000 Subject: [PATCH 03/12] New tests and fixes --- src/python/test/xrefs/conftest.py | 150 +++++--- .../xrefs/flatfiles/RNACentral-md5.tsv.gz | Bin 0 -> 475 bytes .../test/xrefs/flatfiles/UniParc-upidump.lis | 10 + src/python/test/xrefs/flatfiles/config.ini | 88 +++++ src/python/test/xrefs/flatfiles/peptides.fa | 200 ++++++++++ src/python/test/xrefs/flatfiles/sources.json | 16 + .../xrefs/parsers/test_arrayexpress_parser.py | 8 +- .../test/xrefs/parsers/test_ccds_parser.py | 8 +- .../test/xrefs/parsers/test_dbass_parser.py | 12 +- .../xrefs/parsers/test_entrezgene_parser.py | 16 +- .../test/xrefs/parsers/test_hgnc_parser.py | 19 +- .../test/xrefs/parsers/test_hpa_parser.py | 12 +- .../xrefs/parsers/test_jgi_protein_parser.py | 12 +- .../xrefs/parsers/test_mgi_desc_parser.py | 12 +- .../test/xrefs/parsers/test_mgi_parser.py | 12 +- .../xrefs/parsers/test_mim2gene_parser.py | 12 +- .../test/xrefs/parsers/test_mim_parser.py | 16 +- .../test/xrefs/parsers/test_mirbase_parser.py | 12 +- .../xrefs/parsers/test_reactome_parser.py | 27 +- .../test/xrefs/parsers/test_refseq_parser.py | 14 +- .../test/xrefs/parsers/test_rfam_parser.py | 12 +- .../test/xrefs/parsers/test_rgd_parser.py | 14 +- .../test/xrefs/parsers/test_ucsc_parser.py | 12 +- .../test/xrefs/parsers/test_uniprot_parser.py | 14 +- .../test/xrefs/parsers/test_vgnc_parser.py | 12 +- .../parsers/test_xenopus_jamboree_parser.py | 12 +- .../xrefs/parsers/test_zfin_desc_parser.py | 22 +- .../test/xrefs/parsers/test_zfin_parser.py | 12 +- src/python/test/xrefs/pytest.ini | 3 + src/python/test/xrefs/test_checksum.py | 104 +++++ src/python/test/xrefs/test_download_source.py | 118 ++++++ src/python/test/xrefs/test_helpers.py | 25 +- .../test/xrefs/test_schedule_alignment.py | 106 ++++++ .../test/xrefs/test_schedule_cleanup.py | 116 ++++++ .../test/xrefs/test_schedule_download.py | 116 ++++++ src/python/test/xrefs/test_schedule_parse.py | 224 +++++++++++ .../test/xrefs/test_schedule_species.py | 358 ++++++++++++++++++ 37 files changed, 1694 insertions(+), 242 deletions(-) create mode 100644 src/python/test/xrefs/flatfiles/RNACentral-md5.tsv.gz create mode 100644 src/python/test/xrefs/flatfiles/UniParc-upidump.lis create mode 100644 src/python/test/xrefs/flatfiles/config.ini create mode 100644 src/python/test/xrefs/flatfiles/peptides.fa create mode 100644 src/python/test/xrefs/flatfiles/sources.json create mode 100644 src/python/test/xrefs/test_checksum.py create mode 100644 src/python/test/xrefs/test_download_source.py create mode 100644 src/python/test/xrefs/test_schedule_alignment.py create mode 100644 src/python/test/xrefs/test_schedule_cleanup.py create mode 100644 src/python/test/xrefs/test_schedule_download.py create mode 100644 src/python/test/xrefs/test_schedule_parse.py create 
mode 100644 src/python/test/xrefs/test_schedule_species.py diff --git a/src/python/test/xrefs/conftest.py b/src/python/test/xrefs/conftest.py index 36b690013..e1067414e 100644 --- a/src/python/test/xrefs/conftest.py +++ b/src/python/test/xrefs/conftest.py @@ -1,83 +1,103 @@ import pytest -import os import io import re +import os +import importlib from datetime import datetime from unittest.mock import MagicMock -from typing import Any, Generator, Callable +from typing import Any, Generator, Callable, Dict from ensembl.utils.database import UnitTestDB, DBConnection -from ensembl.xrefs.xref_update_db_model import Base +from ensembl.xrefs.xref_update_db_model import Base as BaseUpdateORM +from ensembl.xrefs.xref_source_db_model import Base as BaseSourceORM from ensembl.production.xrefs.parsers.BaseParser import BaseParser -# Fixture to set up a test database +# Adding custom command-line options to pytest +def pytest_addoption(parser): + parser.addoption( + "--test_db_url", + action="store", + default=os.getenv("TEST_DB_URL"), + help="MySQL URL to use for the test databases", + ) + parser.addoption( + "--test_scratch_path", + action="store", + default=os.getenv("TEST_SCRATCH_PATH"), + help="Path to a scratch directory to use for temporary files", + ) + +# Fixture to set up a xref test database @pytest.fixture(scope="module") -def test_db() -> Generator[None, None, None]: - # Create a unique database name using the current user and timestamp - user = os.environ.get("USER", "testuser") +def test_xref_db(pytestconfig: pytest.Config) -> Generator[UnitTestDB, None, None]: + # Retrieve the test DB URL + test_db_url = pytestconfig.getoption("test_db_url") + if not test_db_url: + raise ValueError(f"DB URL for test database must be provided") + + # Create a unique database name using the timestamp timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - db_name = f"{user}_test_xref_{timestamp}" - mysql_url = f"mysql+pymysql://ensadmin:ensembl@mysql-ens-core-prod-1.ebi.ac.uk:4524/{db_name}" + db_name = f"test_xref_update_{timestamp}" + full_test_db_url = f"{test_db_url}/{db_name}" # Create all tables defined in the Base metadata - with UnitTestDB(mysql_url, metadata=Base.metadata, name=db_name) as test_db: + with UnitTestDB(full_test_db_url, metadata=BaseUpdateORM.metadata, name=db_name) as test_db: yield test_db -# Fixture to connect to the test database and close connection when done +# Fixture to connect to the xref test database and close connection when done @pytest.fixture -def mock_xref_dbi(test_db: UnitTestDB) -> Generator[Any, None, None]: - conn = test_db.dbc.connect() +def mock_xref_dbi(test_xref_db) -> Generator[Any, None, None]: + conn = test_xref_db.dbc.connect() yield conn conn.close() -# Common test for missing source_id -@pytest.fixture -def test_no_source_id() -> Callable[[BaseParser, int], None]: - def _test_no_source_id(parser_instance: BaseParser, species_id: int = 9606) -> None: - with pytest.raises( - AttributeError, match=r"Missing required arguments: source_id(,| and) species_id(, and file)?" 
- ): - parser_instance.run( - { - "species_id": species_id, - "file": "dummy_file.txt", - "xref_dbi": MagicMock(), - } - ) - return _test_no_source_id +# Fixture to set up a source test database +@pytest.fixture(scope="module") +def test_source_db(pytestconfig: pytest.Config) -> Generator[UnitTestDB, None, None]: + # Retrieve the test DB URL + test_db_url = pytestconfig.getoption("test_db_url") + if not test_db_url: + raise ValueError(f"DB URL for test database must be provided") + + # Create a unique database name using the timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + db_name = f"test_xref_source_{timestamp}" + full_test_db_url = f"{test_db_url}/{db_name}" + + # Create all tables defined in the Base metadata + with UnitTestDB(full_test_db_url, metadata=BaseSourceORM.metadata, name=db_name) as test_db: + yield test_db -# Common test for missing species_id +# Fixture to connect to the source test database and close connection when done @pytest.fixture -def test_no_species_id() -> Callable[[BaseParser, int], None]: - def _test_no_species_id(parser_instance: BaseParser, source_id: int = 1) -> None: - with pytest.raises( - AttributeError, match=r"Missing required arguments: source_id(,| and) species_id(, and file)?" - ): - parser_instance.run( - { - "source_id": source_id, - "file": "dummy_file.txt", - "xref_dbi": MagicMock(), - } - ) - return _test_no_species_id +def mock_source_dbi(test_source_db) -> Generator[Any, None, None]: + conn = test_source_db.dbc.connect() + yield conn + conn.close() + +# @pytest.fixture +# def mock_source_db_url(test_source_db): +# return test_source_db.dbc.url -# Common test for missing file +# Common test for missing argument @pytest.fixture -def test_no_file() -> Callable[[BaseParser, int, int], None]: - def _test_no_file(parser_instance: BaseParser, source_id: int = 1, species_id: int = 9606) -> None: +def test_parser_missing_argument() -> Callable[[BaseParser, str, int, int], None]: + def _test_parser_missing_argument(parser_instance: BaseParser, arg_name: str, source_id: int = 1, species_id: int = 9606) -> None: + parser_args = { + "source_id": source_id, + "species_id": species_id, + "file": "dummy_file.txt", + "xref_dbi": MagicMock(), + } + if arg_name in parser_args: + del parser_args[arg_name] + with pytest.raises( - AttributeError, match="Missing required arguments: source_id, species_id, and file" + AttributeError, match=r"Missing required arguments: source_id(,| and) species_id(, and file)?" 
): - parser_instance.run( - { - "source_id": source_id, - "species_id": species_id, - "xref_dbi": MagicMock(), - } - ) - return _test_no_file + parser_instance.run(parser_args) + return _test_parser_missing_argument # Common test for file not found @pytest.fixture @@ -132,4 +152,24 @@ def _test_missing_required_source_id(parser_instance: BaseParser, mock_dbi: DBCo "xref_dbi": mock_dbi, } ) - return _test_missing_required_source_id \ No newline at end of file + return _test_missing_required_source_id + +# Common test for missing required parameter +@pytest.fixture +def test_missing_required_param() -> Callable[[str, Dict[str, Any], str], None]: + def _test_missing_required_param(module_name: str, args: Dict[str, Any], param_name: str) -> None: + # Remove the param name being tested from the args + current_args = args.copy() + if param_name in current_args: + del current_args[param_name] + + # Import the module and create an instance + module = importlib.import_module(f"ensembl.production.xrefs.{module_name}") + module_class = getattr(module, module_name) + module_object = module_class(current_args, True, True) + + with pytest.raises( + AttributeError, match=f"Parameter '{param_name}' is required but has no value" + ): + module_object.run() + return _test_missing_required_param \ No newline at end of file diff --git a/src/python/test/xrefs/flatfiles/RNACentral-md5.tsv.gz b/src/python/test/xrefs/flatfiles/RNACentral-md5.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..1057790692375462aa40c6527c4357bcf2c19b90 GIT binary patch literal 475 zcmV<10VMt(iwFpfg(GJG15!>wLuGDsa$#&OZDchrbaQq9Rg>G1)-Vi4->G6t(9M$k z2rLu;vjteP)qe?y;SaOUB;l#xuA9dk?}d0HrJPr zPXIN5b*SKQ@}jNVz*kN`Gb(V~@(bVwECO-EFsf=bAtB3SA_HO#l?#vtfFaG}K*=OG zia7}9Mxz6%<5n0CFh_8V2p1X}P-k%>oWtwRpbroVQk0t<&|HJ8pMolFUx2+@LR;-0 zEEb)al(2ZhS}MEr0{o@m=ngA4R$HOrE@!eVn@sZp=1YOlch^GHFvgHA4iH@cY`cK3 zPpzdxb*!kR6ly%_NbP|>#|34C=R`mCK?CC|j$8rJRA4_~Pv1aYwjv>w} zcse|ux)t7EmPnP-ZO78-G*K~Eu}J8S3;6M}w7O6jT2ukCRr@4&kGyl=wf=mTZ>@sS zHEs~zE-AV@Nu4(M89;b4bp38N0%vKjwkxfWnizcT65{_1d2$#_hGc4_ba8`W0$sa= z^ki`3G6Fgy7jg003C4-VXo( literal 0 HcmV?d00001 diff --git a/src/python/test/xrefs/flatfiles/UniParc-upidump.lis b/src/python/test/xrefs/flatfiles/UniParc-upidump.lis new file mode 100644 index 000000000..eb5e9b96e --- /dev/null +++ b/src/python/test/xrefs/flatfiles/UniParc-upidump.lis @@ -0,0 +1,10 @@ +UPI00018273A1 0346D0CAE142F3B4BEAB03C043F946C2 +UPI00159B34E5 3F21C95F3E901F0CFF99DD2B7AB9E0FB +UPI001BFFF51E AA7F616AF8BBF601C1119851CD1E7D81 +UPI000BB13401 65B4E5BA3C7D523EE05D11D7D93246C7 +UPI00058EFAFA E209A7008D4E2643C34CAA4659995FB9 +UPI000012AB42 E933F38C54E844615DF09466B8372C27 +UPI0010249251 376E4A5FA9F5E0161E03F3168BFC91A8 +UPI0001F8B5D5 6B2B77E4162A054DA364AB5D0F4DFC3B +UPI000CA31F98 F6466B0375E6205467BE0F78FB708040 +UPI000809885D 972EED30B388FBFF1F446ABDC2CD57CD \ No newline at end of file diff --git a/src/python/test/xrefs/flatfiles/config.ini b/src/python/test/xrefs/flatfiles/config.ini new file mode 100644 index 000000000..1bc1619e6 --- /dev/null +++ b/src/python/test/xrefs/flatfiles/config.ini @@ -0,0 +1,88 @@ +[source ArrayExpress::MULTI] +name = ArrayExpress +order = 50 +priority = 1 +prio_descr = multi +parser = ArrayExpressParser + +[source UniParc::MULTI] +name = UniParc +order = 20 +priority = 1 +prio_descr = multi +parser = ChecksumParser + +[source DBASS3::homo_sapiens] +name = DBASS3 +order = 50 +priority = 1 +prio_descr = human +parser = DBASSParser + +[source MIM::homo_sapiens] +name = MIM +order = 10 +priority = 1 
+prio_descr = human +parser = MIMParser + +[source Reactome::MULTI] +name = Reactome +order = 80 +priority = 1 +prio_descr = multi +parser = ReactomeParser + +[source RefSeq_dna::MULTI-vertebrate] +name = RefSeq_dna +order = 15 +priority = 2 +prio_descr = verts +parser = RefSeqParser + +[source RefSeq_dna::gencode] +name = RefSeq_dna +order = 15 +priority = 2 +prio_descr = human +parser = RefSeqParser + +[source RefSeq_dna::MULTI-Plants] +name = RefSeq_dna +order = 15 +priority = 2 +prio_descr = plants +parser = RefSeqParser + +[source RefSeq_peptide::gencode] +name = RefSeq_peptide +order = 30 +priority = 2 +prio_descr = human +parser = RefSeqParser + +[source RefSeq_peptide::MULTI-Plants] +name = RefSeq_peptide +order = 25 +priority = 2 +prio_descr = plants +parser = RefSeqParser + +[source RefSeq_peptide::MULTI-vertebrate] +name = RefSeq_peptide +order = 25 +priority = 2 +prio_descr = verts +parser = RefSeqParser + +[species vertebrates] +taxonomy_id = 7742 +sources = ArrayExpress::MULTI,UniParc::MULTI,Reactome::MULTI,RefSeq_dna::MULTI-vertebrate,RefSeq_peptide::MULTI-vertebrate + +[species homo_sapiens] +taxonomy_id = 9606 +sources = DBASS3::homo_sapiens,MIM::homo_sapiens,RefSeq_dna::gencode,RefSeq_peptide::gencode + +[species plants] +taxonomy_id = 33090 +sources = ArrayExpress::MULTI,UniParc::MULTI,Reactome::MULTI,RefSeq_dna::MULTI-Plants,RefSeq_peptide::MULTI-Plants diff --git a/src/python/test/xrefs/flatfiles/peptides.fa b/src/python/test/xrefs/flatfiles/peptides.fa new file mode 100644 index 000000000..210faceec --- /dev/null +++ b/src/python/test/xrefs/flatfiles/peptides.fa @@ -0,0 +1,200 @@ +>1 +MFMINILMLIIPILLAVAFLTLVERKVLGYMQLRKGPNVVGPYGLLQPIADAIKLFIKEP +LRPATSSASMFILAPIMALGLALTMWIPLPMPYPLINMNLGVLFMLAMSSLAVYSILWSG +WASNSKYALIGALRAVAQTISYEVTLAIILLSVVLMSGSFTLSTLITTQEQMWLILPAWP +LAMMWFISTLAETNRAPFDLTEGESELVSGFNVEYAAGPFALFFMAEYANIIMMNIFTAI +LFLGTSHNPHMPELYTINFTIKSLLLTMSFLWIRASYPRFRYDQLMHLLWKNFLPLTLAL +CMWHVSLPILTSGIPPQT +>2 +MNPIIFIIILLTIMLGTIIVMISSHWLLVWIGFEMNMLAIIPIMMKNHNPRATEASTKYF +LTQSTASMLLMMAVIINVMFSGQWTVMKLFSPMASMLMTMALAMKLGMAPFHFWVPEVTQ +GIPLSSGLILLTWQKLAPMSVLYQIFPSINLNLILTLSVLSILIGGWGGLNQTQLRKIMA +YSSIAHMGWMTAVLPYNPTMTLLNLIIYIIMTSTMFTMFMANSTTTTLSLSHTWNKTPIM +TVLILATLLSMGGLPPLSGFMPKWMIIQEMTKNNSIILPTFMAITALLNLYFYMRLTYST +TLTMFPSTNNMKMKWQFPLMKKMTFLPTMVVLSTMMLPLTPMLSVLE +>3 +MFINRWLFSTSHKDIGTLYLLFDAWAGMVGTALSLLIRAELGQPGTLLGDDQIYNAVVTA +HAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEA +GAGTGWTVYPPLAGNLAHAGASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMSQYQ +TPLFVWSVMITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGH +PEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWAMMSIGFLGFIVWAHHMFTVGMDVD +TRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWSPAMMWALGFIFLFTVGGLTGIVLAN +SSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFVHWFPLFSGYTLNDTWAKIHFAIMFVG +VNMTFFPQHFLGLSGMPRRYSDYPDAYTMWNTISSMGSFISLTAVMLMVFIIWEAFASKR +EVLTVDLTTTNLEWLNGCPPPYHTFEEPTYVNLK +>4 +MAYPMQLGFQDATSPIMEELLHFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQE +VETIWTILPAIILILIALPSLRILYMMDEINNPSLTVKTMGHQWYWSYEYTDYEDLSFDS +YMIPTSELKPGELRLLEVDNRVVLPMEMTIRMLVSSEDVLHSWAVPSLGLKTDAIPGRLN +QTTLMSSRPGLYYGQCSEICGSNHSFMPIVLELVPLKYFEKWSASML +>5 +MPQLDTSTWLTMILSMFLTLFIIFQLKVSKHNFYHNPELTPTKMLKQNTPWETKWTKIYL +PLLLPL +>6 +MNENLFTSFITPVILGLPLVTLIVLFPSLLFPTSNRLVSNRFVTLQQWMLQLVSKQMMSI +HNSKGQTWTLMLMSLILFIGSTNLLGLLPHSFTPTTQLSMNLGMAIPLWAGAVITGFRNK +TKASLAHFLPQGTPTPLIPMLVIIETISLFIQPMALAVRLTANITAGHLLIHLIGGATLA +LMSISTTTALITFTILILLTILEFAVAMIQAYVFTLLVSLYLHDNT +>7 +MTHQTHAYHMVNPSPWPLTGALSALLMTSGLTMWFHFNSMTLLMIGLTTNMLTMYQWWRD +VIRESTFQGHHTPAVQKGLRYGMILFIISEVLFFTGFFWAFYHSSLAPTPELGGCWPPTG 
+IHPLNPLEVPLLNTSVLLASGVSITWAHHSLMEGDRKHMLQALFITITLGVYFTLLQASE +YYEAPFTISDGVYGSTFFVATGFHGLHVIIGSTFLIVCFFRQLKFHFTSNHHFGFEAAAW +YWHFVDVVWLFLYVSIYWWG +>8 +MNLMLALLTNFTLATLLVIIAFWLPQLNVYSEKTSPYECGFDPMGSARLPFSMKFFLVAI +TFLLFDLEIALLLPLPWASQTANLNTMLTMALFLIILLAVSLAYEWTQKGLEWTE +>9 +MSMVYMNIMMAFTVSLVGLLMYRSHLMSSLLCLEGMMLSLFVMAALTILNSHFTLASMMP +IILLVFAACEAALGLSLLVMVSNTYGTDYVQNLNLLQC +>10 +MLKYIIPTIMLMPLTWLSKNNMIWVNSTAHSLLISFTSLLLMNQFGDNSLNFSLLFFSDS +LSTPLLILTMWLLPLMLMASQHHLSKENLTRKKLFITMLISLQLFLIMTFTAMELILFYI +LFEATLVPTLIIITRWGNQTERLNAGLYFLFYTLAGSLPLLVALIYIQNTVGSLNFLMLQ +YWVQPVHNSWSNVFMWLACMMAFMVKMPLYGLHLWLPKAHVEAPIAGSMVLAAVLLKLGG +YGMLRITLILNPMTDFMAYPFIMLSLWGMIMTSSICLRQTDLKSLIAYSSVSHMALVIVA +ILIQTPWSYMGATALMIAHGLTSSMLFCLANSNYERIHSRTMILARGLQTLLPLMATWWL +LASLTNLALPPTINLIGELFVVMSTFSWSNITIILMGVNMVITALYSLYMLIMTQRGKYT +YHINNISPSFTRENALMSLHILPLLLLTLNPKIILGPLY +>11 +MNMFSSLSLVTLLLLTTPIMMMSFNTYKPSNYPLYVKTAISYAFITSMIPTMMFIHSGQE +LIISNWHWLTIQTLKLSLSFKMDYFSMMFIPVALFVTWSIMEFSMWYMYSDPNINKFFKY +LLLFLITMLILVTANNLFQLFIGWEGVGIMSFLLIGWWYGRADANTAALQAILYNRIGDI +GFILAMAWFLTNLNTWDLQQIFMLNPSDSNMPLIGLALAATGKSAQFGLHPWLPSAMEGP +TPVSALLHSSTMVVAGIFLLIRFYPLTENNKYIQSITLCLGAITTLFTAMCALTQNDIKK +IIAFSTSSQLGLMMVTIGINQPYLAFLHICTHAFFKAMLFMCSGSIIHSLNDEQDIRKMG +GLFKAMPFTTTALIVGSLALTGMPFLTGFYSKDLIIEAANTSYTNAWALLMTLIATSFTA +IYSTRIIFFALLGQPRFPTLVNINENNPLLINSIKRLLIGSLFAGYIISNNIPPTTIPQM +TMPYYLKTTALIVTILGFILALEISNMTKNLKYHYPSNAFKFSTLLGYFPTIMHRLAPYM +NLSMSQKSASSLLDLIWLEAILPKTISLAQMKASTLVTNQKGLIKLYFLSFLITILISMI +LFNFHE +>12 +MMLYIVFILSVIFVMGFVGFSSKPSPIYGGLGLIVSGGVGCGIVLNFGGSFLGLMVFLIY +LGGMMVVFGYTTAMATEQYPEIWLSNKAVLGAFVTGLLMEFFMVYYVLKDKEVEVVFEFN +GLGDWVIYDTGDSGFFSEEAMGIAALYSYGTWLVIVTGWSLLIGVVVIMEITRGN +>13 +MTNIRKSHPLMKIVNNAFIDLPAPSNISSWWNFGSLLGICLILQILTGLFLAMHYTSDTT +TAFSSVTHICRDVNYGWIIRYMHANGASMFFICLYMHVGRGLYYGSYTFLETWNIGVILL +LTVMATAFMGYVLPWGQMSFWGATVITNLLSAIPYIGTNLVEWIWGGFSVDKATLTRFFA +FHFILPFIIMAIAMVHLLFLHETGSNNPTGISSDVDKIPFHPYYTIKDILGALLLILALM +LLVLFAPDLLGDPDNYTPANPLNTPPHIKPEWYFLFAYAILRSIPNKLGGVLALAFSILI +LALIPLLHTSKQRSMMFRPLSQCLFWALVADLLTLTWIGGQPVEHPYITIGQLASVLYFL +LILVLMPTAGTIENKLLKW +>14 +MKDFLGWLERFFCPPPCVYLMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQL +NLEGKNYCTAKTLYISDSDKRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLK +NADLCIASGTKVALFNRLRSQTVSTRYLHVEGGNFHASSQQWGAFYIHLLDDDESEGEEF +TVRDGYIHYGQTVKLVCSVTGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTER +MYLCLSQERIIQFQATPCPKEPNKEMINDGASWTIISTDKAEYTFYEGMGPVLAPVTPVP +VVESLQLNGGGDVAMLELTGQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGW +RWVRQPVQVPVTLVRNDGIIYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNT +NSEGSYTNVSTNSTSVTSSTATVVS +>15 +MREAMRNYLKERGDQTVLILHAKVAQKSYGNEKRFFCPPPCVYLMGSGWKKKKEQMERDG +CSEQESQPCAFIGIGNSDQEMQQLNLEGKNYCTAKTLYISDSDKRKHFMLSVKMFYGNSD +DIGVFLSKRIKVISKPSKKKQSLKNADLCIASGTKVALFNRLRSQTVSTRYLHVEGGNFH +ASSQQWGAFYIHLLDDDESEGEEFTVRDGYIHYGQTVKLVCSVTGMALPRLIIRKVDKQT +ALLDADDPVSQLHKCAFYLKDTERMYLCLSQERIIQFQATPCPKEPNKEMINDGASWTII +STDKAEYTFYEGMGPVLAPVTPVPVVESLQLNGGGDVAMLELTGQNFTPNLRVWFGDVEA +ETMYRCGESMLCVVPDISAFREGWRWVRQPVQVPVTLVRNDGIIYSTSLTFTYTPEPGPR +PHCSAAGAILRANSSQVPPNESNTNSEGSYTNVSTNSTSVTSSTATVVS +>16 +MDQMEGSPAEEPPAHAPSLGKFGERPPPKRLTREAMRNYLKERGDQTVLILHAKVAQKSY +GNEKRFFCPPPCVYLMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQLNLEGK +NYCTAKTLYISDSDKRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLKNADLC +IASGTKVALFNRLRSQTVSTRYLHVEGGNFHASSQQWGAFYIHLLDDDESEGEEFTVRDG +YIHYGQTVKLVCSVTGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTERMYLCL +SQERIIQFQATPCPKEPNKEMINDGASWTIISTDKAEYTFYEGMGPVLAPVTPVPVVESL +QLNGGGDVAMLELTGQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGWRWVRQ +PVQVPVTLVRNDGIIYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNTNSEGS +YTNVSTNSTSVTSSTATVVS 
+>17 +MNEKGWELKGAGSHLENTHLRRARPKTRITGALPMDQMEGSPAEEPPAHAPSLGKFGERP +PPKRLTREAMRNYLKERGDQTVLILHAKVAQKSYGNEKRFFCPPPCVYLMGSGWKKKKEQ +MERDGCSEQESQPCAFIGIGNSDQEMQQLNLEGKNYCTAKTLYISDSDKRKHFMLSVKMF +YGNSDDIGVFLSKRIKVISKPSKKKQSLKNADLCIASGTKVALFNRLRSQTVSTRYLHVE +GGNFHASSQQWGAFYIHLLDDDESEGEEFTVRDGYIHYGQTVKLVCSVTGMALPRLIIRK +VDKQTALLDADDPVSQLHKCAFYLKDTERMYLCLSQERIIQFQATPCPKEPNKEMINDGA +SWTIISTDKAEYTFYEGMGPVLAPVTPVPVVESLQLNGGGDVAMLELTGQNFTPNLRVWF +GDVEAETMYRCGESMLCVVPDISAFREGWRWVRQPVQVPVTLVRNDGIIYSTSLTFTYTP +EPGPRPHCSAAGAILRANSSQVPPNESNTNSEGSYTNVSTNSTSVTSSTATVVS +>18 +MLHRLAPGTPSGVSTRRQTLRKFGERPPPKRLTREAMRNYLKERGDQTVLILHAKVAQKS +YGNEKRFFCPPPCVYLMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQLNLEG +KNYCTAKTLYISDSDKRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLKNADL +CIASGTKVALFNRLRSQTVSTRYLHVEGGNFHASSQQWGAFYIHLLDDDESEGEEFTVRD +GYIHYGQTVKLVCSVTGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTERMYLC +LSQERIIQFQATPCPKEPNKEMINDGASWTIISTDKAEYTFYEGMGPVLAPVTPVPVVES +LQLNGGGDVAMLELTGQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGWRWVR +QPVQVPVTLVRNDGIIYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNTNSEG +SYTNVSTNSTSVTSSTATVVS +>19 +MAWIKRKFGERPPPKRLTREAMRNYLKERGDQTVLILHAKVAQKSYGNEKRFFCPPPCVY +LMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQLNLEGKNYCTAKTLYISDSD +KRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLKNADLCIASGTKVALFNRLR +SQTVSTRYLHVEGGNFHASSQQWGAFYIHLLDDDESEGEEFTVRDGYIHYGQTVKLVCSV +TGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTERMYLCLSQERIIQFQATPCP +KEPNKEMINDGASWTIISTDKAEYTFYEGMGPVLAPVTPVPVVESLQLNGGGDVAMLELT +GQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGWRWVRQPVQVPVTLVRNDGI +IYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNTNSEGSYTNVSTNSTSVTSS +TATVVS +>20 +MAWIKRKFGERPPPKRLTREAMRNYLKERGDQTVLILHAKVAQKSYGNEKRFFCPPPCVY +LMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQLNLEGKNYCTAKTLYISDSD +KRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLKNADLDDDESEGEEFTVRDG +YIHYGQTVKLVCSVTGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTERMYLCL +SQERIIQFQATPCPKEPNKEMINDGASWTIISTDKAEYTFYEGMGPVLAPVTPVPVVESL +QLNGGGDVAMLELTGQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGWRWVRQ +PVQVPVTLVRNDGIIYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNTNSEGS +YTNVSTNSTSVTSSTATVVS +>21 +MAWIKRKFGERPPPKRLTREAMRNYLKERGDQTVLILHAKVAQKSYGNEKRFFCPPPCVY +LMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQLNLEGKNYCTAKTLYISDSD +KRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLKNADLCIASGTKVALFNRLR +SQTVSTRYLHVEGGNFHASSQQWGAFYIHLLDDDESEGEEFTVRDGYIHYGQTVKLVCSV +TGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTERMYLCLSQERIIQFQLNGGG +DVAMLELTGQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGWRWVRQPVQVPV +TLVRNDGIIYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNTNSEGSYTNVST +NSTSVTSSTATVVS +>22 +MEGCLPTHHTLPEKHLYAHWLLQRKFGERPPPKRLTREAMRNYLKERGDQTVLILHAKVA +QKSYGNEKRFFCPPPCVYLMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQLN +LEGKNYCTAKTLYISDSDKRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLKN +ADLCIASGTKVALFNRLRSQTVSTRYLHVEGGNFHASSQQWGAFYIHLLDDDESEGEEFT +VRDGYIHYGQTVKLVCSVTGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTERM +YLCLSQERIIQFQATPCPKEPNKEMINDGASWTIISTDKAEYTFYEGMGPVLAPVTPVPV +VESLQLNGGGDVAMLELTGQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGWR +WVRQPVQVPVTLVRNDGIIYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNTN +SEGSYTNVSTNSTSVTSSTATVVS +>23 +MIGLLYPALSRKFGERPPPKRLTREAMRNYLKERGDQTVLILHAKVAQKSYGNEKRFFCP +PPCVYLMGSGWKKKKEQMERDGCSEQESQPCAFIGIGNSDQEMQQLNLEGKNYCTAKTLY +ISDSDKRKHFMLSVKMFYGNSDDIGVFLSKRIKVISKPSKKKQSLKNADLCIASGTKVAL +FNRLRSQTVSTRYLHVEGGNFHASSQQWGAFYIHLLDDDESEGEEFTVRDGYIHYGQTVK +LVCSVTGMALPRLIIRKVDKQTALLDADDPVSQLHKCAFYLKDTERMYLCLSQERIIQFQ +ATPCPKEPNKEMINDGASWTIISTDKAEYTFYEGMGPVLAPVTPVPVVESLQLNGGGDVA +MLELTGQNFTPNLRVWFGDVEAETMYRCGESMLCVVPDISAFREGWRWVRQPVQVPVTLV +RNDGIIYSTSLTFTYTPEPGPRPHCSAAGAILRANSSQVPPNESNTNSEGSYTNVSTNST +SVTSSTATVVS +>24 
+MDVVDSLLMNESNLTPPCELGIENETLFCLDQPHPSKEWQPAVQILLYSLIFLLSVLGNT +LVITVLIRNKRMRTVTNIFLLSLAVSDLMLCLFCMPFNLIPNLLKDFIFGSAVCKTTTYF +MGTSVSVSTFNLVAISLERYGAICKPLQSRVWQTKSHALKVIAATWCLSFTIMTPYPIYS +NLVPFTKNNNQTANMCRFLLPSDVMQQSWHTFLLLILFLIPGIVMMVAYGLISLELYQGI +KFDASQKKSARERKRSSASSGRYPHGQEARHPHAHGHRGPLLPVLDAHLQRQRLEGL +>25 +MDVVDSLLMNESNLTPPCELGIENETLFCLDQPHPSKEWQPAVQILLYSLIFLLSVLGNT +LVITVLIRNKRMRTVTNIFLLSLAVSDLMLCLFCMPFNLIPNLLKDFIFGSAVCKTTTYF +MGTSVSVSTFNLVAISLERYGAICKPLQSRVWQTKSHALKVIAATWCLSFTIMTPYPIYS +NLVPFTKNNNQTANMCRFLLPSDVMQQSWHTFLLLILFLIPGIVMMVAYGLISLELYQGI +KFDASQKKSARERKRSSASSGRYADSAGCCLQRPKHPRKLELRQLSTGSAGRADRIRSSS +PAASLMAKKRVIRMLMVIVVLFFLCWMPIFSANAWRAFDTASAERRLSGTPIAFILLLSY +TSSCVNPIIYCFMNKRIVEAALRLRSPSLFQEHSVTTHLTMTTDGNRKQTLFWPFSVLQT +SRSKGEL +>26 +MLQEESDLSLIIAQIVQKLKGSNLYAQLERQAWASLQRPEIKLESLKEDIKEFFKISGWE +KKLQNAVYSELSVFPLPSHPAAPPEHLKEPLVYMRKAQGSWEKRILKSLNSMCTELSIPL +ARKRPVGEQKELLNKWNEMGTDEPDLSLFRPVYAPKDFLEVLINLRNPNYESGDSLSFRT +HLGLIQVPLKVKDIPELKEFFVELGLTTGQLGIDDSTQVPPELFENEHVRIGQKVLTQQD +SAAAQQYIRQGSPTALRAELWALILNISSHPEDILYYEQLKTNVIQHDLLVDSLIYKDVK +LTASNDDYYFVFEDYLYQVLLCFSRDTSVLSHFAYNSASPPKSYIRGKLGLEEYAVF diff --git a/src/python/test/xrefs/flatfiles/sources.json b/src/python/test/xrefs/flatfiles/sources.json new file mode 100644 index 000000000..1b45a2acb --- /dev/null +++ b/src/python/test/xrefs/flatfiles/sources.json @@ -0,0 +1,16 @@ +[ + { + "name" : "ArrayExpress", + "parser" : "ArrayExpressParser", + "file" : "Database", + "db" : "core", + "priority" : 1 + }, + { + "name" : "RNACentral", + "parser" : "ChecksumParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/md5/md5.tsv.gz", + "db" : "checksum", + "priority" : 1 + } +] \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_arrayexpress_parser.py b/src/python/test/xrefs/parsers/test_arrayexpress_parser.py index db0379e08..b73fd037c 100644 --- a/src/python/test/xrefs/parsers/test_arrayexpress_parser.py +++ b/src/python/test/xrefs/parsers/test_arrayexpress_parser.py @@ -37,11 +37,9 @@ def run_and_validate_parsing(arrayexpress_parser: ArrayExpressParser, mock_xref_ ), f"{prefix}Expected 'Added {expected_xrefs} DIRECT xrefs' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id and species_id -def test_arrayexpress_no_source_id(arrayexpress_parser: ArrayExpressParser, test_no_source_id: Callable[[ArrayExpressParser, int], None]) -> None: - test_no_source_id(arrayexpress_parser, SPECIES_ID_HUMAN) - -def test_arrayexpress_no_species_id(arrayexpress_parser: ArrayExpressParser, test_no_species_id: Callable[[ArrayExpressParser, int], None]) -> None: - test_no_species_id(arrayexpress_parser, SOURCE_ID_ARRAYEXPRESS) +def test_arrayexpress_missing_argument(arrayexpress_parser: ArrayExpressParser, test_parser_missing_argument: Callable[[ArrayExpressParser, str, int, int], None]) -> None: + test_parser_missing_argument(arrayexpress_parser, "source_id", SOURCE_ID_ARRAYEXPRESS, SPECIES_ID_HUMAN) + test_parser_missing_argument(arrayexpress_parser, "species_id", SOURCE_ID_ARRAYEXPRESS, SPECIES_ID_HUMAN) # Test case to check if parsing is skipped when no species name can be found def test_no_species_name(mock_xref_dbi: DBConnection, arrayexpress_parser: ArrayExpressParser) -> None: diff --git a/src/python/test/xrefs/parsers/test_ccds_parser.py b/src/python/test/xrefs/parsers/test_ccds_parser.py index 1f7fe93e9..4b22225ef 100644 --- a/src/python/test/xrefs/parsers/test_ccds_parser.py +++ 
b/src/python/test/xrefs/parsers/test_ccds_parser.py @@ -36,11 +36,9 @@ def run_and_validate_parsing(ccds_parser: CCDSParser, mock_xref_dbi: DBConnectio ), f"{prefix}Expected 'Parsed CCDS identifiers, added {expected_xrefs} xrefs and {expected_direct_xrefs} direct_xrefs' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id and species_id -def test_ccds_no_source_id(ccds_parser: CCDSParser, test_no_source_id: Callable[[CCDSParser, int], None]) -> None: - test_no_source_id(ccds_parser, SPECIES_ID_HUMAN) - -def test_ccds_no_species_id(ccds_parser: CCDSParser, test_no_species_id: Callable[[CCDSParser, int], None]) -> None: - test_no_species_id(ccds_parser, SOURCE_ID_CCDS) +def test_ccds_missing_argument(ccds_parser: CCDSParser, test_parser_missing_argument: Callable[[CCDSParser, str, int, int], None]) -> None: + test_parser_missing_argument(ccds_parser, "source_id", SOURCE_ID_CCDS, SPECIES_ID_HUMAN) + test_parser_missing_argument(ccds_parser, "species_id", SOURCE_ID_CCDS, SPECIES_ID_HUMAN) # Test case to check if an error is raised when no CCDS database is provided def test_no_ccds_db(ccds_parser: CCDSParser) -> None: diff --git a/src/python/test/xrefs/parsers/test_dbass_parser.py b/src/python/test/xrefs/parsers/test_dbass_parser.py index c6ec23967..8b19caf55 100644 --- a/src/python/test/xrefs/parsers/test_dbass_parser.py +++ b/src/python/test/xrefs/parsers/test_dbass_parser.py @@ -41,14 +41,10 @@ def run_and_validate_parsing(dbass_parser: DBASSParser, mock_xref_dbi: DBConnect ), f"{prefix}Expected 'Skipped {expected_skipped_xrefs} unmapped xrefs' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_dbass_no_source_id(dbass_parser: DBASSParser, test_no_source_id: Callable[[DBASSParser, int], None]) -> None: - test_no_source_id(dbass_parser, SPECIES_ID_HUMAN) - -def test_dbass_no_species_id(dbass_parser: DBASSParser, test_no_species_id: Callable[[DBASSParser, int], None]) -> None: - test_no_species_id(dbass_parser, SOURCE_ID_DBASS3) - -def test_dbass_no_file(dbass_parser: DBASSParser, test_no_file: Callable[[DBASSParser, int, int], None]) -> None: - test_no_file(dbass_parser, SOURCE_ID_DBASS3, SPECIES_ID_HUMAN) +def test_dbass_missing_argument(dbass_parser: DBASSParser, test_parser_missing_argument: Callable[[DBASSParser, str, int, int], None]) -> None: + test_parser_missing_argument(dbass_parser, "source_id", SOURCE_ID_DBASS3, SPECIES_ID_HUMAN) + test_parser_missing_argument(dbass_parser, "species_id", SOURCE_ID_DBASS3, SPECIES_ID_HUMAN) + test_parser_missing_argument(dbass_parser, "file", SOURCE_ID_DBASS3, SPECIES_ID_HUMAN) # Test case to check if an error is raised when the file is not found def test_dbass_file_not_found(dbass_parser: DBASSParser, test_file_not_found: Callable[[DBASSParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_entrezgene_parser.py b/src/python/test/xrefs/parsers/test_entrezgene_parser.py index f0d31f8ae..32709b79a 100644 --- a/src/python/test/xrefs/parsers/test_entrezgene_parser.py +++ b/src/python/test/xrefs/parsers/test_entrezgene_parser.py @@ -38,14 +38,10 @@ def run_and_validate_parsing(entrezgene_parser: EntrezGeneParser, mock_xref_dbi: ), f"{prefix}Expected '{expected_entrez_xrefs} EntrezGene Xrefs and {expected_wiki_xrefs} WikiGene Xrefs added with {expected_synonyms} synonyms' in result_message, but got: '{result_message}'" # Test cases to check if mandatory 
parser arguments are passed: source_id, species_id, and file -def test_entrezgene_no_source_id(entrezgene_parser: EntrezGeneParser, test_no_source_id: Callable[[EntrezGeneParser, int], None]) -> None: - test_no_source_id(entrezgene_parser, SPECIES_ID_HUMAN) - -def test_entrezgene_no_species_id(entrezgene_parser: EntrezGeneParser, test_no_species_id: Callable[[EntrezGeneParser, int], None]) -> None: - test_no_species_id(entrezgene_parser, SOURCE_ID_ENTREZGENE) - -def test_entrezgene_no_file(entrezgene_parser: EntrezGeneParser, test_no_file: Callable[[EntrezGeneParser, int, int], None]) -> None: - test_no_file(entrezgene_parser, SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN) +def test_entrezgene_missing_argument(entrezgene_parser: EntrezGeneParser, test_parser_missing_argument: Callable[[EntrezGeneParser, str, int, int], None]) -> None: + test_parser_missing_argument(entrezgene_parser, "source_id", SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN) + test_parser_missing_argument(entrezgene_parser, "species_id", SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN) + test_parser_missing_argument(entrezgene_parser, "file", SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN) # Test case to check if an error is raised when the file is not found def test_entrezgene_file_not_found(entrezgene_parser: EntrezGeneParser, test_file_not_found: Callable[[EntrezGeneParser, int, int], None]) -> None: @@ -111,6 +107,10 @@ def test_malformed_headers(entrezgene_parser: EntrezGeneParser, header: str) -> } ) +# Test case to check if an error is raised when the required source_id is missing +def test_entrezgene_missing_required_source_id(entrezgene_parser: EntrezGeneParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[EntrezGeneParser, DBConnection, str, int, int, str], None]) -> None: + test_missing_required_source_id(entrezgene_parser, mock_xref_dbi, 'WikiGene', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN) + # Test case to check if an error is raised when the file has insufficient columns def test_insufficient_columns(entrezgene_parser: EntrezGeneParser) -> None: mock_file = io.StringIO() diff --git a/src/python/test/xrefs/parsers/test_hgnc_parser.py b/src/python/test/xrefs/parsers/test_hgnc_parser.py index 7f920779f..f09bae291 100644 --- a/src/python/test/xrefs/parsers/test_hgnc_parser.py +++ b/src/python/test/xrefs/parsers/test_hgnc_parser.py @@ -66,14 +66,15 @@ def run_and_validate_parsing(hgnc_parser: HGNCParser, mock_xref_dbi: DBConnectio ), f"{prefix}Expected '{expected_mismatch} HGNC ids could not be associated in xrefs' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_hgnc_no_source_id(hgnc_parser: HGNCParser, test_no_source_id: Callable[[HGNCParser, int], None]) -> None: - test_no_source_id(hgnc_parser, SPECIES_ID_HUMAN) +def test_hgnc_missing_argument(hgnc_parser: HGNCParser, test_parser_missing_argument: Callable[[HGNCParser, str, int, int], None]) -> None: + test_parser_missing_argument(hgnc_parser, "source_id", SOURCE_ID_HGNC, SPECIES_ID_HUMAN) + test_parser_missing_argument(hgnc_parser, "species_id", SOURCE_ID_HGNC, SPECIES_ID_HUMAN) + test_parser_missing_argument(hgnc_parser, "file", SOURCE_ID_HGNC, SPECIES_ID_HUMAN) -def test_hgnc_no_species_id(hgnc_parser: HGNCParser, test_no_species_id: Callable[[HGNCParser, int], None]) -> None: - test_no_species_id(hgnc_parser, SOURCE_ID_HGNC) - -def test_hgnc_no_file(hgnc_parser: HGNCParser, test_no_file: Callable[[HGNCParser, int, int], None]) -> None: - 
test_no_file(hgnc_parser, SOURCE_ID_HGNC, SPECIES_ID_HUMAN) +# Test case to check if an error is raised when the required source_id is missing +def test_hgnc_missing_required_source_id(hgnc_parser: HGNCParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[HGNCParser, DBConnection, str, int, int, str], None]) -> None: + hgnc_parser.get_source_name_for_source_id = MagicMock(return_value="HGNC") + test_missing_required_source_id(hgnc_parser, mock_xref_dbi, 'HGNC', SOURCE_ID_ENTREZGENE, SPECIES_ID_HUMAN, 'ccds') # Test case to check if an error is raised when no CCDS database is provided def test_no_ccds_db(hgnc_parser: HGNCParser) -> None: @@ -106,7 +107,7 @@ def test_successful_parsing_without_existing_xrefs(mock_xref_dbi: DBConnection, hgnc_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) hgnc_parser.construct_db_url = MagicMock(return_value="dummy_db_url") hgnc_parser.get_ccds_to_ens_mapping = MagicMock(return_value={}) - hgnc_parser.get_valid_codes = MagicMock(return_value={}) + hgnc_parser.get_acc_to_xref_ids = MagicMock(return_value={}) hgnc_parser.get_valid_xrefs_for_dependencies = MagicMock(return_value={}) # Run and validate parsing for HGNC file @@ -132,7 +133,7 @@ def test_successful_parsing_with_existing_xrefs(mock_xref_dbi: DBConnection, hgn hgnc_parser.get_source_id_for_source_name = MagicMock(side_effect=mock_get_source_id_for_source_name) hgnc_parser.construct_db_url = MagicMock(return_value="dummy_db_url") hgnc_parser.get_ccds_to_ens_mapping = MagicMock(return_value={"CCDS12976": "CCDS12976", "CCDS8856": "CCDS8856", "CCDS53797": "CCDS53797"}) - hgnc_parser.get_valid_codes = MagicMock(return_value={"NM_130786": [12], "NR_026971": [34, 56], "NR_015380": [78], "NM_001088": [90]}) + hgnc_parser.get_acc_to_xref_ids = MagicMock(return_value={"NM_130786": [12], "NR_026971": [34, 56], "NR_015380": [78], "NM_001088": [90]}) hgnc_parser.get_valid_xrefs_for_dependencies = MagicMock(return_value={"503538": 123, "441376": 456, "51146": 789}) # Run and validate parsing for HGNC file diff --git a/src/python/test/xrefs/parsers/test_hpa_parser.py b/src/python/test/xrefs/parsers/test_hpa_parser.py index 838a3756e..04b860382 100644 --- a/src/python/test/xrefs/parsers/test_hpa_parser.py +++ b/src/python/test/xrefs/parsers/test_hpa_parser.py @@ -37,14 +37,10 @@ def run_and_validate_parsing(hpa_parser: HPAParser, mock_xref_dbi: DBConnection, ), f"{prefix}Expected '{expected_xrefs} direct xrefs successfully parsed' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_hpa_no_source_id(hpa_parser: HPAParser, test_no_source_id: Callable[[HPAParser, int], None]) -> None: - test_no_source_id(hpa_parser, SPECIES_ID_HUMAN) - -def test_hpa_no_species_id(hpa_parser: HPAParser, test_no_species_id: Callable[[HPAParser, int], None]) -> None: - test_no_species_id(hpa_parser, SOURCE_ID_HPA) - -def test_hpa_no_file(hpa_parser: HPAParser, test_no_file: Callable[[HPAParser, int, int], None]) -> None: - test_no_file(hpa_parser, SOURCE_ID_HPA, SPECIES_ID_HUMAN) +def test_hpa_missing_argument(hpa_parser: HPAParser, test_parser_missing_argument: Callable[[HPAParser, str, int, int], None]) -> None: + test_parser_missing_argument(hpa_parser, "source_id", SOURCE_ID_HPA, SPECIES_ID_HUMAN) + test_parser_missing_argument(hpa_parser, "species_id", SOURCE_ID_HPA, SPECIES_ID_HUMAN) + test_parser_missing_argument(hpa_parser, "file", SOURCE_ID_HPA, 
SPECIES_ID_HUMAN) # Test case to check if an error is raised when the file is not found def test_hpa_file_not_found(hpa_parser: HPAParser, test_file_not_found: Callable[[HPAParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_jgi_protein_parser.py b/src/python/test/xrefs/parsers/test_jgi_protein_parser.py index 666e6fa95..d936b7da9 100644 --- a/src/python/test/xrefs/parsers/test_jgi_protein_parser.py +++ b/src/python/test/xrefs/parsers/test_jgi_protein_parser.py @@ -34,14 +34,10 @@ def run_and_validate_parsing(jgi_protein_parser: JGI_ProteinParser, mock_xref_db assert f"{expected_xrefs} JGI_ xrefs successfully parsed" in result_message, f"{prefix}Expected '{expected_xrefs} JGI_ xrefs successfully parsed' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_jgi_no_source_id(jgi_protein_parser: JGI_ProteinParser, test_no_source_id: Callable[[JGI_ProteinParser, int], None]) -> None: - test_no_source_id(jgi_protein_parser, SPECIES_ID_C_INTESTINALIS) - -def test_jgi_no_species_id(jgi_protein_parser: JGI_ProteinParser, test_no_species_id: Callable[[JGI_ProteinParser, int], None]) -> None: - test_no_species_id(jgi_protein_parser, SOURCE_ID_JGI) - -def test_jgi_no_file(jgi_protein_parser: JGI_ProteinParser, test_no_file: Callable[[JGI_ProteinParser, int, int], None]) -> None: - test_no_file(jgi_protein_parser, SOURCE_ID_JGI, SPECIES_ID_C_INTESTINALIS) +def test_jgi_missing_argument(jgi_protein_parser: JGI_ProteinParser, test_parser_missing_argument: Callable[[JGI_ProteinParser, str, int, int], None]) -> None: + test_parser_missing_argument(jgi_protein_parser, "source_id", SOURCE_ID_JGI, SPECIES_ID_C_INTESTINALIS) + test_parser_missing_argument(jgi_protein_parser, "species_id", SOURCE_ID_JGI, SPECIES_ID_C_INTESTINALIS) + test_parser_missing_argument(jgi_protein_parser, "file", SOURCE_ID_JGI, SPECIES_ID_C_INTESTINALIS) # Test case to check if an error is raised when the file is not found def test_jgi_file_not_found(jgi_protein_parser: JGI_ProteinParser, test_file_not_found: Callable[[JGI_ProteinParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_mgi_desc_parser.py b/src/python/test/xrefs/parsers/test_mgi_desc_parser.py index 02b46352b..cedf77c3e 100644 --- a/src/python/test/xrefs/parsers/test_mgi_desc_parser.py +++ b/src/python/test/xrefs/parsers/test_mgi_desc_parser.py @@ -40,14 +40,10 @@ def run_and_validate_parsing(mgi_desc_parser: MGIDescParser, mock_xref_dbi: DBCo ), f"{prefix}Expected '{expected_synonyms} synonyms added' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_mgi_desc_no_source_id(mgi_desc_parser: MGIDescParser, test_no_source_id: Callable[[MGIDescParser, int], None]) -> None: - test_no_source_id(mgi_desc_parser, SPECIES_ID_MOUSE) - -def test_mgi_desc_no_species_id(mgi_desc_parser: MGIDescParser, test_no_species_id: Callable[[MGIDescParser, int], None]) -> None: - test_no_species_id(mgi_desc_parser, SOURCE_ID_MGI_DESC) - -def test_mgi_desc_no_file(mgi_desc_parser: MGIDescParser, test_no_file: Callable[[MGIDescParser, int, int], None]) -> None: - test_no_file(mgi_desc_parser, SOURCE_ID_MGI_DESC, SPECIES_ID_MOUSE) +def test_mgi_desc_missing_argument(mgi_desc_parser: MGIDescParser, test_parser_missing_argument: Callable[[MGIDescParser, str, int, int], None]) -> None: +
test_parser_missing_argument(mgi_desc_parser, "source_id", SOURCE_ID_MGI_DESC, SPECIES_ID_MOUSE) + test_parser_missing_argument(mgi_desc_parser, "species_id", SOURCE_ID_MGI_DESC, SPECIES_ID_MOUSE) + test_parser_missing_argument(mgi_desc_parser, "file", SOURCE_ID_MGI_DESC, SPECIES_ID_MOUSE) # Test case to check if an error is raised when the file is not found def test_mgi_desc_file_not_found(mgi_desc_parser: MGIDescParser, test_file_not_found: Callable[[MGIDescParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_mgi_parser.py b/src/python/test/xrefs/parsers/test_mgi_parser.py index fab933d60..5897556cc 100644 --- a/src/python/test/xrefs/parsers/test_mgi_parser.py +++ b/src/python/test/xrefs/parsers/test_mgi_parser.py @@ -38,14 +38,10 @@ def run_and_validate_parsing(mgi_parser: MGIParser, mock_xref_dbi: DBConnection, ), f"{prefix}Expected '{expected_synonyms} synonyms added' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_mgi_no_source_id(mgi_parser: MGIParser, test_no_source_id: Callable[[MGIParser, int], None]) -> None: - test_no_source_id(mgi_parser, SPECIES_ID_MOUSE) - -def test_mgi_no_species_id(mgi_parser: MGIParser, test_no_species_id: Callable[[MGIParser, int], None]) -> None: - test_no_species_id(mgi_parser, SOURCE_ID_MGI) - -def test_mgi_no_file(mgi_parser: MGIParser, test_no_file: Callable[[MGIParser, int, int], None]) -> None: - test_no_file(mgi_parser, SOURCE_ID_MGI, SPECIES_ID_MOUSE) +def test_mgi_missing_argument(mgi_parser: MGIParser, test_parser_missing_argument: Callable[[MGIParser, str, int, int], None]) -> None: + test_parser_missing_argument(mgi_parser, "source_id", SOURCE_ID_MGI, SPECIES_ID_MOUSE) + test_parser_missing_argument(mgi_parser, "species_id", SOURCE_ID_MGI, SPECIES_ID_MOUSE) + test_parser_missing_argument(mgi_parser, "file", SOURCE_ID_MGI, SPECIES_ID_MOUSE) # Test case to check if an error is raised when the file is not found def test_mgi_file_not_found(mgi_parser: MGIParser, test_file_not_found: Callable[[MGIParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_mim2gene_parser.py b/src/python/test/xrefs/parsers/test_mim2gene_parser.py index 590c1c3bc..8dc2d284a 100644 --- a/src/python/test/xrefs/parsers/test_mim2gene_parser.py +++ b/src/python/test/xrefs/parsers/test_mim2gene_parser.py @@ -115,14 +115,10 @@ def run_and_validate_parsing(mim2gene_parser: Mim2GeneParser, mock_xref_dbi: DBC ), f"{prefix}Expected '{expected_missed_master} had missing master entries' in result message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_mim2gene_no_source_id(mim2gene_parser: Mim2GeneParser, test_no_source_id: Callable[[Mim2GeneParser, int], None]) -> None: - test_no_source_id(mim2gene_parser, SPECIES_ID_HUMAN) - -def test_mim2gene_no_species_id(mim2gene_parser: Mim2GeneParser, test_no_species_id: Callable[[Mim2GeneParser, int], None]) -> None: - test_no_species_id(mim2gene_parser, SOURCE_ID_MIM2GENE) - -def test_mim2gene_no_file(mim2gene_parser: Mim2GeneParser, test_no_file: Callable[[Mim2GeneParser, int, int], None]) -> None: - test_no_file(mim2gene_parser, SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) +def test_mim2gene_missing_argument(mim2gene_parser: Mim2GeneParser, test_parser_missing_argument: Callable[[Mim2GeneParser, str, int, int], None]) -> None: + test_parser_missing_argument(mim2gene_parser, "source_id", 
SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) + test_parser_missing_argument(mim2gene_parser, "species_id", SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) + test_parser_missing_argument(mim2gene_parser, "file", SOURCE_ID_MIM2GENE, SPECIES_ID_HUMAN) # Test case to check if an error is raised when the file is not found def test_mim2gene_file_not_found(mim2gene_parser: Mim2GeneParser, test_file_not_found: Callable[[Mim2GeneParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_mim_parser.py b/src/python/test/xrefs/parsers/test_mim_parser.py index 676c182bf..ce5b4c187 100644 --- a/src/python/test/xrefs/parsers/test_mim_parser.py +++ b/src/python/test/xrefs/parsers/test_mim_parser.py @@ -52,14 +52,14 @@ def run_and_validate_parsing(mim_parser: MIMParser, mock_xref_dbi: DBConnection, ), f"{prefix}Expected '{expected_removed_entries} entries removed' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_mim_no_source_id(mim_parser: MIMParser, test_no_source_id: Callable[[MIMParser, int], None]) -> None: - test_no_source_id(mim_parser, SPECIES_ID_HUMAN) - -def test_mim_no_species_id(mim_parser: MIMParser, test_no_species_id: Callable[[MIMParser, int], None]) -> None: - test_no_species_id(mim_parser, SOURCE_ID_MIM) - -def test_mim_no_file(mim_parser: MIMParser, test_no_file: Callable[[MIMParser, int, int], None]) -> None: - test_no_file(mim_parser, SOURCE_ID_MIM, SPECIES_ID_HUMAN) +def test_mim_missing_argument(mim_parser: MIMParser, test_parser_missing_argument: Callable[[MIMParser, str, int, int], None]) -> None: + test_parser_missing_argument(mim_parser, "source_id", SOURCE_ID_MIM, SPECIES_ID_HUMAN) + test_parser_missing_argument(mim_parser, "species_id", SOURCE_ID_MIM, SPECIES_ID_HUMAN) + test_parser_missing_argument(mim_parser, "file", SOURCE_ID_MIM, SPECIES_ID_HUMAN) + +# Test case to check if an error is raised when the required source_id is missing +def test_mim_missing_required_source_id(mim_parser: MIMParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[MIMParser, DBConnection, str, int, int, str], None]) -> None: + test_missing_required_source_id(mim_parser, mock_xref_dbi, 'MIM_GENE', SOURCE_ID_MIM, SPECIES_ID_HUMAN) # Test case to check if an error is raised when the file is not found def test_mim_file_not_found(mim_parser: MIMParser, test_file_not_found: Callable[[MIMParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_mirbase_parser.py b/src/python/test/xrefs/parsers/test_mirbase_parser.py index f9c426c3a..61ae37028 100644 --- a/src/python/test/xrefs/parsers/test_mirbase_parser.py +++ b/src/python/test/xrefs/parsers/test_mirbase_parser.py @@ -39,14 +39,10 @@ def run_and_validate_parsing(mirbase_parser: miRBaseParser, mock_xref_dbi: DBCon ), f"{prefix}Expected 'Read {expected_xrefs} xrefs from' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_mirbase_no_source_id(mirbase_parser: miRBaseParser, test_no_source_id: Callable[[miRBaseParser, int], None]) -> None: - test_no_source_id(mirbase_parser, SPECIES_ID_C_ELEGANS) - -def test_mirbase_no_species_id(mirbase_parser: miRBaseParser, test_no_species_id: Callable[[miRBaseParser, int], None]) -> None: - test_no_species_id(mirbase_parser, SOURCE_ID_MIRBASE) - -def test_mirbase_no_file(mirbase_parser: miRBaseParser, test_no_file: Callable[[miRBaseParser, int, int], None]) -> 
None: - test_no_file(mirbase_parser, SOURCE_ID_MIRBASE, SPECIES_ID_C_ELEGANS) +def test_mirbase_missing_argument(mirbase_parser: miRBaseParser, test_parser_missing_argument: Callable[[miRBaseParser, str, int, int], None]) -> None: + test_parser_missing_argument(mirbase_parser, "source_id", SOURCE_ID_MIRBASE, SPECIES_ID_C_ELEGANS) + test_parser_missing_argument(mirbase_parser, "species_id", SOURCE_ID_MIRBASE, SPECIES_ID_C_ELEGANS) + test_parser_missing_argument(mirbase_parser, "file", SOURCE_ID_MIRBASE, SPECIES_ID_C_ELEGANS) # Test case to check if an error is raised when the file is not found def test_mirbase_file_not_found(mirbase_parser: miRBaseParser, test_file_not_found: Callable[[miRBaseParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_reactome_parser.py b/src/python/test/xrefs/parsers/test_reactome_parser.py index 9187fde0e..92e18e0f1 100644 --- a/src/python/test/xrefs/parsers/test_reactome_parser.py +++ b/src/python/test/xrefs/parsers/test_reactome_parser.py @@ -46,6 +46,8 @@ def populate_xref_db(mock_xref_dbi: DBConnection): } ) + mock_xref_dbi.commit() + # Function to run and validate the parsing process def run_and_validate_parsing(reactome_parser: ReactomeParser, mock_xref_dbi: DBConnection, file: str, expected_processed: int, expected_dependent: int, expected_direct: int, expected_errors: int, prefix: str = None) -> None: if prefix is None: @@ -77,14 +79,10 @@ def run_and_validate_parsing(reactome_parser: ReactomeParser, mock_xref_dbi: DBC ), f"{prefix}Expected '{expected_errors} not found' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_reactome_no_source_id(reactome_parser: ReactomeParser, test_no_source_id: Callable[[ReactomeParser, int], None]) -> None: - test_no_source_id(reactome_parser, SPECIES_ID_HUMAN) - -def test_reactome_no_species_id(reactome_parser: ReactomeParser, test_no_species_id: Callable[[ReactomeParser, int], None]) -> None: - test_no_species_id(reactome_parser, SOURCE_ID_REACTOME) - -def test_reactome_no_file(reactome_parser: ReactomeParser, test_no_file: Callable[[ReactomeParser, int, int], None]) -> None: - test_no_file(reactome_parser, SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) +def test_reactome_missing_argument(reactome_parser: ReactomeParser, test_parser_missing_argument: Callable[[ReactomeParser, str, int, int], None]) -> None: + test_parser_missing_argument(reactome_parser, "source_id", SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) + test_parser_missing_argument(reactome_parser, "species_id", SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) + test_parser_missing_argument(reactome_parser, "file", SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) # Test case to check if parsing is skipped when no species name can be found def test_no_species_name(mock_xref_dbi: DBConnection, reactome_parser: ReactomeParser) -> None: @@ -117,31 +115,30 @@ def test_reactome_empty_file(reactome_parser: ReactomeParser, test_empty_file: C reactome_parser.species_id_to_names = MagicMock(return_value={SPECIES_ID_HUMAN: [SPECIES_NAME_HUMAN]}) test_empty_file(reactome_parser, 'Reactome', SOURCE_ID_REACTOME, SPECIES_ID_HUMAN) -# Test case to check successful parsing of valid Reactome data without existing uniprot xrefs -def test_successful_parsing_without_existing_uniprot(mock_xref_dbi: DBConnection, reactome_parser: ReactomeParser) -> None: +# Test case to check successful parsing of valid Reactome data +def test_successful_parsing(mock_xref_dbi: DBConnection, 
reactome_parser: ReactomeParser) -> None: populate_xref_db(mock_xref_dbi) # Run and validate parsing for Uniprot and Ensembl Reactome files run_and_validate_parsing(reactome_parser, mock_xref_dbi, "reactome_UniProt", 8, 0, 0, 0) run_and_validate_parsing(reactome_parser, mock_xref_dbi, "reactome_ensembl", 14, 0, 13, 1) - # Check the row counts in the xref and direct_xref tables + # Check the row counts in the xref, direct_xref, and dependent_xref tables check_row_count(mock_xref_dbi, "xref", 6, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_GENE}") check_row_count(mock_xref_dbi, "xref", 4, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_TRANSCRIPT}") check_row_count(mock_xref_dbi, "xref", 3, f"info_type='DIRECT' AND source_id={SOURCE_ID_REACTOME_DIRECT}") check_row_count(mock_xref_dbi, "gene_direct_xref", 6) check_row_count(mock_xref_dbi, "transcript_direct_xref", 4) check_row_count(mock_xref_dbi, "translation_direct_xref", 3) + check_row_count(mock_xref_dbi, "dependent_xref", 0) # Check the link between an xref and direct_xref tables check_direct_xref_link(mock_xref_dbi, "gene", "R-HSA-1643685", "ENSG00000000419") check_direct_xref_link(mock_xref_dbi, "transcript", "R-HSA-199991", "ENST00000000233") check_direct_xref_link(mock_xref_dbi, "translation", "R-HSA-199991", "ENSP00000000233") -# Test case to check successful parsing of valid Reactome data with existing uniprot xrefs -def test_successful_parsing_with_existing_uniprot(mock_xref_dbi: DBConnection, reactome_parser: ReactomeParser) -> None: - populate_xref_db(mock_xref_dbi) - reactome_parser.get_valid_codes = MagicMock(return_value={"A0A075B6P5": [12], "A0A075B6S6" : [34, 56], "A0A087WPF7": [78], "A0A096LNF2": [90]}) + # Add uniprot xrefs + reactome_parser.get_acc_to_xref_ids = MagicMock(return_value={"A0A075B6P5": [12], "A0A075B6S6" : [34, 56], "A0A087WPF7": [78], "A0A096LNF2": [90]}) # Run and validate re-parsing for Uniprot and Ensembl Reactome files run_and_validate_parsing(reactome_parser, mock_xref_dbi, "reactome_UniProt", 8, 6, 0, 0, "Re-parsing: ") diff --git a/src/python/test/xrefs/parsers/test_refseq_parser.py b/src/python/test/xrefs/parsers/test_refseq_parser.py index 2b8a77f2c..6f9c5f1f1 100644 --- a/src/python/test/xrefs/parsers/test_refseq_parser.py +++ b/src/python/test/xrefs/parsers/test_refseq_parser.py @@ -142,17 +142,13 @@ def run_and_validate_parsing(refseq_parser: RefSeqParser, mock_xref_dbi: DBConne ), f"{prefix}Expected 'WikiGene\t{wiki}' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_refseq_no_source_id(refseq_parser: RefSeqParser, test_no_source_id: Callable[[RefSeqParser, int], None]) -> None: - test_no_source_id(refseq_parser, SPECIES_ID_HUMAN) - -def test_refseq_no_species_id(refseq_parser: RefSeqParser, test_no_species_id: Callable[[RefSeqParser, int], None]) -> None: - test_no_species_id(refseq_parser, SOURCE_ID_REFSEQ_MRNA) - -def test_refseq_no_file(refseq_parser: RefSeqParser, test_no_file: Callable[[RefSeqParser, int, int], None]) -> None: - test_no_file(refseq_parser, SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) +def test_refseq_missing_argument(refseq_parser: RefSeqParser, test_parser_missing_argument: Callable[[RefSeqParser, str, int, int], None]) -> None: + test_parser_missing_argument(refseq_parser, "source_id", SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) + test_parser_missing_argument(refseq_parser, "species_id", SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) + 
test_parser_missing_argument(refseq_parser, "file", SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) # Test case to check if an error is raised when the required source_id is missing -def test_mim2gene_missing_required_source_id(refseq_parser: RefSeqParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[RefSeqParser, DBConnection, str, int, int, str], None]) -> None: +def test_refseq_missing_required_source_id(refseq_parser: RefSeqParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[RefSeqParser, DBConnection, str, int, int, str], None]) -> None: test_missing_required_source_id(refseq_parser, mock_xref_dbi, 'RefSeq_peptide', SOURCE_ID_REFSEQ, SPECIES_ID_HUMAN) # Test case to check if parsing is skipped when no species name can be found diff --git a/src/python/test/xrefs/parsers/test_rfam_parser.py b/src/python/test/xrefs/parsers/test_rfam_parser.py index 86caa9669..5d9c780ba 100644 --- a/src/python/test/xrefs/parsers/test_rfam_parser.py +++ b/src/python/test/xrefs/parsers/test_rfam_parser.py @@ -37,14 +37,10 @@ def run_and_validate_parsing(rfam_parser: RFAMParser, mock_xref_dbi: DBConnectio ), f"{prefix}Expected 'Added {expected_xrefs} RFAM xrefs and {expected_direct_xrefs} direct xrefs' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_rfam_no_source_id(rfam_parser: RFAMParser, test_no_source_id: Callable[[RFAMParser, int], None]) -> None: - test_no_source_id(rfam_parser, SPECIES_ID_HUMAN) - -def test_rfam_no_species_id(rfam_parser: RFAMParser, test_no_species_id: Callable[[RFAMParser, int], None]) -> None: - test_no_species_id(rfam_parser, SOURCE_ID_RFAM) - -def test_rfam_no_file(rfam_parser: RFAMParser, test_no_file: Callable[[RFAMParser, int, int], None]) -> None: - test_no_file(rfam_parser, SOURCE_ID_RFAM, SPECIES_ID_HUMAN) +def test_rfam_missing_argument(rfam_parser: RFAMParser, test_parser_missing_argument: Callable[[RFAMParser, str, int, int], None]) -> None: + test_parser_missing_argument(rfam_parser, "source_id", SOURCE_ID_RFAM, SPECIES_ID_HUMAN) + test_parser_missing_argument(rfam_parser, "species_id", SOURCE_ID_RFAM, SPECIES_ID_HUMAN) + test_parser_missing_argument(rfam_parser, "file", SOURCE_ID_RFAM, SPECIES_ID_HUMAN) # Test case to check if parsing is skipped when no species name can be found def test_no_species_name(mock_xref_dbi: DBConnection, rfam_parser: RFAMParser) -> None: diff --git a/src/python/test/xrefs/parsers/test_rgd_parser.py b/src/python/test/xrefs/parsers/test_rgd_parser.py index 2b8019c3f..d057b02f4 100644 --- a/src/python/test/xrefs/parsers/test_rgd_parser.py +++ b/src/python/test/xrefs/parsers/test_rgd_parser.py @@ -45,14 +45,10 @@ def run_and_validate_parsing(rgd_parser: RGDParser, mock_xref_dbi: DBConnection, ), f"{prefix}Expected 'Added {expected_synonyms} synonyms, including duplicates' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_rgd_no_source_id(rgd_parser: RGDParser, test_no_source_id: Callable[[RGDParser, int], None]) -> None: - test_no_source_id(rgd_parser, SPECIES_ID_RAT) - -def test_rgd_no_species_id(rgd_parser: RGDParser, test_no_species_id: Callable[[RGDParser, int], None]) -> None: - test_no_species_id(rgd_parser, SOURCE_ID_RGD) - -def test_rgd_no_file(rgd_parser: RGDParser, test_no_file: Callable[[RGDParser, int, int], None]) -> None: - test_no_file(rgd_parser, SOURCE_ID_RGD, SPECIES_ID_RAT) +def 
test_rgd_missing_argument(rgd_parser: RGDParser, test_parser_missing_argument: Callable[[RGDParser, str, int, int], None]) -> None: + test_parser_missing_argument(rgd_parser, "source_id", SOURCE_ID_RGD, SPECIES_ID_RAT) + test_parser_missing_argument(rgd_parser, "species_id", SOURCE_ID_RGD, SPECIES_ID_RAT) + test_parser_missing_argument(rgd_parser, "file", SOURCE_ID_RGD, SPECIES_ID_RAT) # Test case to check if an error is raised when the file is not found def test_rgd_file_not_found(rgd_parser: RGDParser, test_file_not_found: Callable[[RGDParser, int, int], None]) -> None: @@ -89,7 +85,7 @@ def test_successful_parsing_without_refseqs(mock_xref_dbi: DBConnection, rgd_par # Test case to check successful parsing of valid RGD data with refseqs def test_successful_parsing_with_refseqs(mock_xref_dbi: DBConnection, rgd_parser: RGDParser) -> None: rgd_parser.get_source_id_for_source_name = MagicMock(return_value=SOURCE_ID_DIRECT) - rgd_parser.get_valid_codes = MagicMock(return_value={"NM_052979": [12, 34], "XM_039101774" : [56], "XM_063281326": [78]}) + rgd_parser.get_acc_to_xref_ids = MagicMock(return_value={"NM_052979": [12, 34], "XM_039101774" : [56], "XM_063281326": [78]}) # Run and validate parsing for RGD file with existing refseqs run_and_validate_parsing(rgd_parser, mock_xref_dbi, 3, 5, 1, 12) diff --git a/src/python/test/xrefs/parsers/test_ucsc_parser.py b/src/python/test/xrefs/parsers/test_ucsc_parser.py index ae96e4d3f..13e1dc071 100644 --- a/src/python/test/xrefs/parsers/test_ucsc_parser.py +++ b/src/python/test/xrefs/parsers/test_ucsc_parser.py @@ -36,14 +36,10 @@ def run_and_validate_parsing(ucsc_parser: UCSCParser, mock_xref_dbi: DBConnectio ), f"{prefix}Expected 'Loaded a total of {expected_xrefs} UCSC xrefs' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_ucsc_no_source_id(ucsc_parser: UCSCParser, test_no_source_id: Callable[[UCSCParser, int], None]) -> None: - test_no_source_id(ucsc_parser, SPECIES_ID_HUMAN) - -def test_ucsc_no_species_id(ucsc_parser: UCSCParser, test_no_species_id: Callable[[UCSCParser, int], None]) -> None: - test_no_species_id(ucsc_parser, SOURCE_ID_UCSC) - -def test_ucsc_no_file(ucsc_parser: UCSCParser, test_no_file: Callable[[UCSCParser, int, int], None]) -> None: - test_no_file(ucsc_parser, SOURCE_ID_UCSC, SPECIES_ID_HUMAN) +def test_ucsc_missing_argument(ucsc_parser: UCSCParser, test_parser_missing_argument: Callable[[UCSCParser, str, int, int], None]) -> None: + test_parser_missing_argument(ucsc_parser, "source_id", SOURCE_ID_UCSC, SPECIES_ID_HUMAN) + test_parser_missing_argument(ucsc_parser, "species_id", SOURCE_ID_UCSC, SPECIES_ID_HUMAN) + test_parser_missing_argument(ucsc_parser, "file", SOURCE_ID_UCSC, SPECIES_ID_HUMAN) # Test case to check if an error is raised when the file is not found def test_ucsc_file_not_found(ucsc_parser: UCSCParser, test_file_not_found: Callable[[UCSCParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_uniprot_parser.py b/src/python/test/xrefs/parsers/test_uniprot_parser.py index 0cf0e2cc7..c80337717 100644 --- a/src/python/test/xrefs/parsers/test_uniprot_parser.py +++ b/src/python/test/xrefs/parsers/test_uniprot_parser.py @@ -63,6 +63,8 @@ def populate_xref_db(mock_xref_dbi: DBConnection): } ) + mock_xref_dbi.commit() + # Function to run and validate the parsing process def run_and_validate_parsing(uniprot_parser: UniProtParser, mock_xref_dbi: DBConnection, file:str, expected_xrefs: Dict[str, 
int], expected_deps: Dict[str, int], prefix: str = None) -> None: if prefix is None: @@ -107,14 +109,10 @@ def run_and_validate_parsing(uniprot_parser: UniProtParser, mock_xref_dbi: DBCon assert f"{count_type}\t{count}" in result_message, f"{prefix}Expected '{count_type}\t{count}' in result_meesgae, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_uniprot_no_source_id(uniprot_parser: UniProtParser, test_no_source_id: Callable[[UniProtParser, int], None]) -> None: - test_no_source_id(uniprot_parser, SPECIES_ID_HUMAN) - -def test_uniprot_no_species_id(uniprot_parser: UniProtParser, test_no_species_id: Callable[[UniProtParser, int], None]) -> None: - test_no_species_id(uniprot_parser, SOURCE_ID_UNIPROT) - -def test_uniprot_no_file(uniprot_parser: UniProtParser, test_no_file: Callable[[UniProtParser, int, int], None]) -> None: - test_no_file(uniprot_parser, SOURCE_ID_UNIPROT, SPECIES_ID_HUMAN) +def test_uniprot_missing_argument(uniprot_parser: UniProtParser, test_parser_missing_argument: Callable[[UniProtParser, str, int, int], None]) -> None: + test_parser_missing_argument(uniprot_parser, "source_id", SOURCE_ID_UNIPROT, SPECIES_ID_HUMAN) + test_parser_missing_argument(uniprot_parser, "species_id", SOURCE_ID_UNIPROT, SPECIES_ID_HUMAN) + test_parser_missing_argument(uniprot_parser, "file", SOURCE_ID_UNIPROT, SPECIES_ID_HUMAN) # Test case to check if an error is raised when the required source_id is missing def test_uniprot_missing_required_source_id(uniprot_parser: UniProtParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[UniProtParser, DBConnection, str, int, int, str], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_vgnc_parser.py b/src/python/test/xrefs/parsers/test_vgnc_parser.py index 6ebe58d8d..5fb2297bc 100644 --- a/src/python/test/xrefs/parsers/test_vgnc_parser.py +++ b/src/python/test/xrefs/parsers/test_vgnc_parser.py @@ -36,14 +36,10 @@ def run_and_validate_parsing(vgnc_parser: VGNCParser, mock_xref_dbi: DBConnectio ), f"{prefix}Expected 'Loaded a total of {expected_xrefs} VGNC xrefs and added {expected_synonyms} synonyms' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_vgnc_no_source_id(vgnc_parser: VGNCParser, test_no_source_id: Callable[[VGNCParser, int], None]) -> None: - test_no_source_id(vgnc_parser, SPECIES_ID_P_TROGLODYTES) - -def test_vgnc_no_species_id(vgnc_parser: VGNCParser, test_no_species_id: Callable[[VGNCParser, int], None]) -> None: - test_no_species_id(vgnc_parser, SOURCE_ID_VGNC) - -def test_vgnc_no_file(vgnc_parser: VGNCParser, test_no_file: Callable[[VGNCParser, int, int], None]) -> None: - test_no_file(vgnc_parser, SOURCE_ID_VGNC, SPECIES_ID_P_TROGLODYTES) +def test_vgnc_missing_argument(vgnc_parser: VGNCParser, test_parser_missing_argument: Callable[[VGNCParser, str, int, int], None]) -> None: + test_parser_missing_argument(vgnc_parser, "source_id", SOURCE_ID_VGNC, SPECIES_ID_P_TROGLODYTES) + test_parser_missing_argument(vgnc_parser, "species_id", SOURCE_ID_VGNC, SPECIES_ID_P_TROGLODYTES) + test_parser_missing_argument(vgnc_parser, "file", SOURCE_ID_VGNC, SPECIES_ID_P_TROGLODYTES) # Test case to check if an error is raised when the file is not found def test_vgnc_file_not_found(vgnc_parser: VGNCParser, test_file_not_found: Callable[[VGNCParser, int, int], None]) -> None: diff --git 
a/src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py b/src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py index dda0c7bdc..1a8d36d90 100644 --- a/src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py +++ b/src/python/test/xrefs/parsers/test_xenopus_jamboree_parser.py @@ -34,14 +34,10 @@ def run_and_validate_parsing(xenopus_jamboree_parser: XenopusJamboreeParser, moc ), f"{prefix}Expected '{expected_xrefs} XenopusJamboree xrefs successfully parsed' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_xenopus_jamboree_no_source_id(xenopus_jamboree_parser: XenopusJamboreeParser, test_no_source_id: Callable[[XenopusJamboreeParser, int], None]) -> None: - test_no_source_id(xenopus_jamboree_parser, SPECIES_ID_XENOPUS) - -def test_xenopus_jamboree_no_species_id(xenopus_jamboree_parser: XenopusJamboreeParser, test_no_species_id: Callable[[XenopusJamboreeParser, int], None]) -> None: - test_no_species_id(xenopus_jamboree_parser, SOURCE_ID_XENOPUS_JAMBOREE) - -def test_xenopus_jamboree_no_file(xenopus_jamboree_parser: XenopusJamboreeParser, test_no_file: Callable[[XenopusJamboreeParser, int, int], None]) -> None: - test_no_file(xenopus_jamboree_parser, SOURCE_ID_XENOPUS_JAMBOREE, SPECIES_ID_XENOPUS) +def test_xenopus_jamboree_missing_argument(xenopus_jamboree_parser: XenopusJamboreeParser, test_parser_missing_argument: Callable[[XenopusJamboreeParser, str, int, int], None]) -> None: + test_parser_missing_argument(xenopus_jamboree_parser, "source_id", SOURCE_ID_XENOPUS_JAMBOREE, SPECIES_ID_XENOPUS) + test_parser_missing_argument(xenopus_jamboree_parser, "species_id", SOURCE_ID_XENOPUS_JAMBOREE, SPECIES_ID_XENOPUS) + test_parser_missing_argument(xenopus_jamboree_parser, "file", SOURCE_ID_XENOPUS_JAMBOREE, SPECIES_ID_XENOPUS) # Test case to check if an error is raised when the file is not found def test_xenopus_jamboree_file_not_found(xenopus_jamboree_parser: XenopusJamboreeParser, test_file_not_found: Callable[[XenopusJamboreeParser, int, int], None]) -> None: diff --git a/src/python/test/xrefs/parsers/test_zfin_desc_parser.py b/src/python/test/xrefs/parsers/test_zfin_desc_parser.py index 1ef373c46..ac3e52eed 100644 --- a/src/python/test/xrefs/parsers/test_zfin_desc_parser.py +++ b/src/python/test/xrefs/parsers/test_zfin_desc_parser.py @@ -6,7 +6,7 @@ from test_helpers import check_row_count # Constants -SOURCE_ID_ZFIN = 1 +SOURCE_ID_ZFIN_DESC = 1 SPECIES_ID_ZEBRAFISH = 7955 # Fixture to create a ZFINDescParser instance @@ -21,7 +21,7 @@ def run_and_validate_parsing(zfin_desc_parser: ZFINDescParser, mock_xref_dbi: DB result_code, result_message = zfin_desc_parser.run( { - "source_id": SOURCE_ID_ZFIN, + "source_id": SOURCE_ID_ZFIN_DESC, "species_id": SPECIES_ID_ZEBRAFISH, "file": "parsers/flatfiles/zfin_desc.txt", "xref_dbi": mock_xref_dbi, @@ -37,22 +37,18 @@ def run_and_validate_parsing(zfin_desc_parser: ZFINDescParser, mock_xref_dbi: DB ), f"{prefix}Expected '{expected_withdrawn} withdrawn entries ignored' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_zfin_desc_no_source_id(zfin_desc_parser: ZFINDescParser, test_no_source_id: Callable[[ZFINDescParser, int], None]) -> None: - test_no_source_id(zfin_desc_parser, SPECIES_ID_ZEBRAFISH) - -def test_zfin_desc_no_species_id(zfin_desc_parser: ZFINDescParser, test_no_species_id: Callable[[ZFINDescParser, int], None]) -> 
None: - test_no_species_id(zfin_desc_parser, SOURCE_ID_ZFIN) - -def test_zfin_desc_no_file(zfin_desc_parser: ZFINDescParser, test_no_file: Callable[[ZFINDescParser, int, int], None]) -> None: - test_no_file(zfin_desc_parser, SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) +def test_zfin_desc_missing_argument(zfin_desc_parser: ZFINDescParser, test_parser_missing_argument: Callable[[ZFINDescParser, str, int, int], None]) -> None: + test_parser_missing_argument(zfin_desc_parser, "source_id", SOURCE_ID_ZFIN_DESC, SPECIES_ID_ZEBRAFISH) + test_parser_missing_argument(zfin_desc_parser, "species_id", SOURCE_ID_ZFIN_DESC, SPECIES_ID_ZEBRAFISH) + test_parser_missing_argument(zfin_desc_parser, "file", SOURCE_ID_ZFIN_DESC, SPECIES_ID_ZEBRAFISH) # Test case to check if an error is raised when the file is not found def test_zfin_desc_file_not_found(zfin_desc_parser: ZFINDescParser, test_file_not_found: Callable[[ZFINDescParser, int, int], None]) -> None: - test_file_not_found(zfin_desc_parser, SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + test_file_not_found(zfin_desc_parser, SOURCE_ID_ZFIN_DESC, SPECIES_ID_ZEBRAFISH) # Test case to check if an error is raised when the file is empty def test_zfin_desc_empty_file(zfin_desc_parser: ZFINDescParser, test_empty_file: Callable[[ZFINDescParser, str, int, int], None]) -> None: - test_empty_file(zfin_desc_parser, 'ZFINDesc', SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + test_empty_file(zfin_desc_parser, 'ZFINDesc', SOURCE_ID_ZFIN_DESC, SPECIES_ID_ZEBRAFISH) # Test case to check successful parsing of valid ZFINDesc data def test_successful_parsing(mock_xref_dbi: DBConnection, zfin_desc_parser: ZFINDescParser) -> None: @@ -60,4 +56,4 @@ def test_successful_parsing(mock_xref_dbi: DBConnection, zfin_desc_parser: ZFIND run_and_validate_parsing(zfin_desc_parser, mock_xref_dbi, 6, 3) # Check the row counts in the xref table - check_row_count(mock_xref_dbi, "xref", 6, f"info_type='MISC' AND source_id={SOURCE_ID_ZFIN}") \ No newline at end of file + check_row_count(mock_xref_dbi, "xref", 6, f"info_type='MISC' AND source_id={SOURCE_ID_ZFIN_DESC}") \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_zfin_parser.py b/src/python/test/xrefs/parsers/test_zfin_parser.py index 060ffa2bc..4972fb1b8 100644 --- a/src/python/test/xrefs/parsers/test_zfin_parser.py +++ b/src/python/test/xrefs/parsers/test_zfin_parser.py @@ -108,14 +108,10 @@ def run_and_validate_parsing(zfin_parser: ZFINParser, mock_xref_dbi: DBConnectio ), f"{prefix}Expected '{expected_synonyms} synonyms loaded' in result_message, but got: '{result_message}'" # Test cases to check if mandatory parser arguments are passed: source_id, species_id, and file -def test_zfin_no_source_id(zfin_parser: ZFINParser, test_no_source_id: Callable[[ZFINParser, int], None]) -> None: - test_no_source_id(zfin_parser, SPECIES_ID_ZEBRAFISH) - -def test_zfin_no_species_id(zfin_parser: ZFINParser, test_no_species_id: Callable[[ZFINParser, int], None]) -> None: - test_no_species_id(zfin_parser, SOURCE_ID_ZFIN) - -def test_zfin_no_file(zfin_parser: ZFINParser, test_no_file: Callable[[ZFINParser, int, int], None]) -> None: - test_no_file(zfin_parser, SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) +def test_zfin_missing_argument(zfin_parser: ZFINParser, test_parser_missing_argument: Callable[[ZFINParser, str, int, int], None]) -> None: + test_parser_missing_argument(zfin_parser, "source_id", SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + test_parser_missing_argument(zfin_parser, "species_id", SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) + 
test_parser_missing_argument(zfin_parser, "file", SOURCE_ID_ZFIN, SPECIES_ID_ZEBRAFISH) # Test case to check if an error is raised when the required source_id is missing def test_zfin_missing_required_source_id(zfin_parser: ZFINParser, mock_xref_dbi: DBConnection, test_missing_required_source_id: Callable[[ZFINParser, DBConnection, str, int, int, str], None]) -> None: diff --git a/src/python/test/xrefs/pytest.ini b/src/python/test/xrefs/pytest.ini index b79469489..dbdc951d2 100644 --- a/src/python/test/xrefs/pytest.ini +++ b/src/python/test/xrefs/pytest.ini @@ -1,2 +1,5 @@ [pytest] +env = + TEST_DB_URL = mysql://USER:PASS@HOST:PORT + TEST_SCRATCH_PATH = /homes/USER/tmp addopts = --verbose --tb=line \ No newline at end of file diff --git a/src/python/test/xrefs/test_checksum.py b/src/python/test/xrefs/test_checksum.py new file mode 100644 index 000000000..4d86ad0c7 --- /dev/null +++ b/src/python/test/xrefs/test_checksum.py @@ -0,0 +1,104 @@ +import pytest +import os +import shutil +import datetime +from typing import Any, Dict, Callable, Optional +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count + +from ensembl.production.xrefs.Checksum import Checksum + +DEFAULT_ARGS = { + "base_path": "dummy_base_path", + "source_db_url": "mysql://user:pass@host/db", + "skip_download": False, +} + +# Fixture to create a Checksum instance +@pytest.fixture +def checksum() -> Callable[[Optional[Dict[str, Any]]], Checksum]: + def _create_checksum(args: Optional[Dict[str, Any]] = None) -> Checksum: + # Use provided args or default to default_args + args = args or DEFAULT_ARGS + + return Checksum(args, True, True) + return _create_checksum + +# Test case to check if an error is raised when a mandatory parameter is missing +def test_checksum_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): + test_missing_required_param("Checksum", DEFAULT_ARGS, "base_path") + test_missing_required_param("Checksum", DEFAULT_ARGS, "source_db_url") + test_missing_required_param("Checksum", DEFAULT_ARGS, "skip_download") + +# Test case to check successful run +def test_successful_run(mock_source_dbi: DBConnection, checksum: Checksum, pytestconfig: pytest.Config): + # Setup for test parameters and create a Checksum instance + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = { + "base_path": test_scratch_path, + "source_db_url": mock_source_dbi.engine.url, + "skip_download": False, + } + checksum_instance = checksum(args) + + checksum_path = os.path.join(test_scratch_path, "Checksum") + checksum_file = os.path.join(checksum_path, "checksum.txt") + try: + # Run the Checksum instance without checksum source files + checksum_instance.run() + + # Check that the Checksum folder was created + assert os.path.exists(test_scratch_path), "Checksum folder was not created" + + # Check that no checksum.txt file was created + assert not os.path.exists(checksum_file), "File checksum.txt was created" + + # Copy some checksum files into the Checksum folder + shutil.copy("flatfiles/RNACentral-md5.tsv.gz", checksum_path) + shutil.copy("flatfiles/UniParc-upidump.lis", checksum_path) + + # Run the Checksum instance again + checksum_instance.run() + + # Check that the checksum.txt file was created and is not empty + assert os.path.exists(checksum_file), "File checksum.txt was not created" + assert os.path.getsize(checksum_file) > 0, "File checksum.txt is empty" + + # Get the last modified time and size of the file + timestamp = 
os.path.getmtime(checksum_file) + last_modified = datetime.datetime.fromtimestamp(timestamp) + size = os.path.getsize(checksum_file) + + # Check that the checksum rows were added + check_row_count(mock_source_dbi, "checksum_xref", 30) + + # Run the Checksum instance again + checksum_instance.run() + + # Check that the checksum.txt file was created again + timestamp = os.path.getmtime(checksum_file) + new_last_modified = datetime.datetime.fromtimestamp(timestamp) + assert new_last_modified > last_modified, "File checksum.txt was not created again" + assert os.path.getsize(checksum_file) == size, "File checksum.txt does not have the same size" + last_modified = new_last_modified + + # Check that the checksum rows are still the same + check_row_count(mock_source_dbi, "checksum_xref", 30) + + # Set the skip_download parameter to True + checksum_instance.set_param("skip_download", True) + + # Run the Checksum instance again + checksum_instance.run() + + # Check that the checksum.txt file was not created again + timestamp = os.path.getmtime(checksum_file) + new_last_modified = datetime.datetime.fromtimestamp(timestamp) + assert new_last_modified == last_modified, "File checksum.txt was created again" + + # Check that the checksum rows are still the same + check_row_count(mock_source_dbi, "checksum_xref", 30) + finally: + # Cleanup: Remove the Checksum folder if it exists + if os.path.exists(checksum_path): + shutil.rmtree(checksum_path) \ No newline at end of file diff --git a/src/python/test/xrefs/test_download_source.py b/src/python/test/xrefs/test_download_source.py new file mode 100644 index 000000000..4e537ab5b --- /dev/null +++ b/src/python/test/xrefs/test_download_source.py @@ -0,0 +1,118 @@ +import pytest +import os +import shutil +import datetime +from typing import Any, Dict, Callable, Optional +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count + +from ensembl.production.xrefs.DownloadSource import DownloadSource + +DEFAULT_ARGS = { + "base_path": "dummy_base_path", + "parser": "dummy_parser", + "name": "dummy_name", + "priority": 1, + "source_db_url": "mysql://user:pass@host/db", + "file": "dummy_file", + "skip_download": False, +} + +# Fixture to create a DownloadSource instance +@pytest.fixture +def download_source() -> Callable[[Optional[Dict[str, Any]]], DownloadSource]: + def _create_download_source(args: Optional[Dict[str, Any]] = None) -> DownloadSource: + # Use provided args or default to default_args + args = args or DEFAULT_ARGS + + return DownloadSource(args, True, True) + return _create_download_source + +# Test case to check if an error is raised when a mandatory parameter is missing +def test_download_source_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): + test_missing_required_param("DownloadSource", DEFAULT_ARGS, "base_path") + test_missing_required_param("DownloadSource", DEFAULT_ARGS, "parser") + test_missing_required_param("DownloadSource", DEFAULT_ARGS, "name") + test_missing_required_param("DownloadSource", DEFAULT_ARGS, "priority") + test_missing_required_param("DownloadSource", DEFAULT_ARGS, "source_db_url") + test_missing_required_param("DownloadSource", DEFAULT_ARGS, "file") + test_missing_required_param("DownloadSource", DEFAULT_ARGS, "skip_download") + +# Test case to check if an error is raised when an invalid URL scheme is provided +def test_invalid_url_scheme(download_source: DownloadSource, pytestconfig): + # Setup for test parameters and create a DownloadSource 
instance + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = DEFAULT_ARGS.copy() + args["base_path"] = test_scratch_path + args["file"] = "wrong://dummy_file" + download_source_instance = download_source(args) + + try: + # Run the DownloadSource instance + with pytest.raises( + AttributeError, match="Invalid URL scheme wrong" + ): + download_source_instance.run() + finally: + # Cleanup: Remove the created path if it exists + dummy_source_path = os.path.join(test_scratch_path, "dummy_name") + if os.path.exists(dummy_source_path): + shutil.rmtree(dummy_source_path) + +# TO DO: Add test cases to check for ftp and copy cases + downloading version files + +# Test case to check successful run +def test_successful_run(mock_source_dbi: DBConnection, download_source: DownloadSource, pytestconfig: pytest.Config): + # Setup for test parameters and create a DownloadSource instance + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = { + "base_path": test_scratch_path, + "parser": "DBASSParser", + "name": "DBASS3", + "priority": 1, + "source_db_url": mock_source_dbi.engine.url, + "file": "https://www.dbass.soton.ac.uk/Dbass3/DownloadCsv", + "skip_download": False, + } + download_source_instance = download_source(args) + + try: + # Run the DownloadSource instance + download_source_instance.run() + + # Check if the file was downloaded + file_path = os.path.join(test_scratch_path, "DBASS3", "DownloadCsv") + assert os.path.exists(file_path), "DBASS3 file not downloaded into the correct path" + + # Check if the source was added to the source table + check_row_count(mock_source_dbi, "source", 1) + check_row_count(mock_source_dbi, "version", 1) + + # Get the last modified time of the file + timestamp = os.path.getmtime(file_path) + last_modified = datetime.datetime.fromtimestamp(timestamp) + + # Run the DownloadSource instance again + download_source_instance.run() + + # Check that the file was downloaded again + timestamp = os.path.getmtime(file_path) + new_last_modified = datetime.datetime.fromtimestamp(timestamp) + assert new_last_modified > last_modified, "DBASS3 file not downloaded again" + last_modified = new_last_modified + + # Set the skip_download parameter to True + download_source_instance.set_param("skip_download", True) + + # Run the DownloadSource instance again + download_source_instance.run() + + # Check that the file was not downloaded again + timestamp = os.path.getmtime(file_path) + new_last_modified = datetime.datetime.fromtimestamp(timestamp) + assert new_last_modified == last_modified, "DBASS3 file downloaded again" + finally: + # Cleanup: Remove the created file and path if it exists + source_path = os.path.join(test_scratch_path, "DBASS3") + if os.path.exists(source_path): + shutil.rmtree(source_path) \ No newline at end of file diff --git a/src/python/test/xrefs/test_helpers.py b/src/python/test/xrefs/test_helpers.py index efe1e35d4..6cbac3910 100644 --- a/src/python/test/xrefs/test_helpers.py +++ b/src/python/test/xrefs/test_helpers.py @@ -1,6 +1,8 @@ +import json from sqlalchemy import text +from typing import List, Dict, Any -from ensembl.utils.database import UnitTestDB, DBConnection +from ensembl.utils.database import DBConnection # Helper function to check the row count in a specific table def check_row_count(db: DBConnection, table: str, expected_count: int, where_clause: str = None) -> None: @@ -77,4 +79,23 @@ def check_release(db: DBConnection, source_id: str, expected_release: str) -> No ).scalar() assert ( release == 
expected_release - ), f"Expected release info '{expected_release}' for source_id {source_id}, but got '{release}'" \ No newline at end of file + ), f"Expected release info '{expected_release}' for source_id {source_id}, but got '{release}'" + +# Helper function to check the dataflow content of a dataflow file +def check_dataflow_content(dataflow_file_path: str, expected_content: List[Dict[str, Any]]) -> None: + # Get the content of the dataflow file + actual_content = [] + with open(dataflow_file_path) as fh: + for line in fh: + actual_content.append(json.loads(line.strip())) + + # Sort both the expected and actual content lists + actual_content_sorted = sorted(actual_content, key=lambda x: json.dumps(x, sort_keys=True)) + expected_content_sorted = sorted(expected_content, key=lambda x: json.dumps(x, sort_keys=True)) + + # Compare the expected and actual content + assert actual_content_sorted == expected_content_sorted, ( + f"Dataflow file content does not match expected content.\n" + f"Expected (sorted): {expected_content_sorted}\n" + f"Actual (sorted): {actual_content_sorted}" + ) \ No newline at end of file diff --git a/src/python/test/xrefs/test_schedule_alignment.py b/src/python/test/xrefs/test_schedule_alignment.py new file mode 100644 index 000000000..2254a58e2 --- /dev/null +++ b/src/python/test/xrefs/test_schedule_alignment.py @@ -0,0 +1,106 @@ +import pytest +import os +import shutil +from typing import Any, Dict, Callable, Optional +from test_helpers import check_dataflow_content + +from ensembl.production.xrefs.ScheduleAlignment import ScheduleAlignment + +DEFAULT_ARGS = { + "species_name": "homo_sapiens", + "release": 999, + "ensembl_fasta": "dummy_ensembl_fasta.fa", + "xref_fasta": "dummy_xref_fasta.fa", + "seq_type": "peptide", + "xref_db_url": "mysql://user:pass@host/xref_db", + "base_path": "dummy_base_path", + "method": "--bestn 1", + "query_cutoff": 100, + "target_cutoff": 100, + "source_id": 1, + "source_name": "RefSeq_peptide", + "job_index": 1, + "chunk_size": 4000 +} + +# Fixture to create a ScheduleAlignment instance +@pytest.fixture +def schedule_alignment() -> Callable[[Optional[Dict[str, Any]]], ScheduleAlignment]: + def _create_schedule_alignment(args: Optional[Dict[str, Any]] = None) -> ScheduleAlignment: + # Use provided args or default to default_args + args = args or DEFAULT_ARGS + + return ScheduleAlignment(args, True, True) + return _create_schedule_alignment + +# Test case to check if an error is raised when a mandatory parameter is missing +def test_schedule_alignment_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "species_name") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "release") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "ensembl_fasta") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "xref_fasta") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "seq_type") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "xref_db_url") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "base_path") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "method") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "query_cutoff") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "target_cutoff") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "source_id") + 
test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "source_name") + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "job_index") + +# Test case to check successful run +def test_successful_run(schedule_alignment: ScheduleAlignment, pytestconfig: pytest.Config): + # Setup for test parameters and create a ScheduleAlignment instance + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = DEFAULT_ARGS.copy() + args["base_path"] = test_scratch_path + args["dataflow_output_path"] = test_scratch_path + schedule_alignment_instance = schedule_alignment(args) + + dataflow_file_path = os.path.join(test_scratch_path, "dataflow_alignment.json") + try: + # Create the appropriate paths and copy a fasta file + ensembl_path = schedule_alignment_instance.get_path(test_scratch_path, "homo_sapiens", 999, "ensembl") + shutil.copy("flatfiles/peptides.fa", ensembl_path) + ensembl_file_path = os.path.join(ensembl_path, "peptides.fa") + schedule_alignment_instance.set_param("ensembl_fasta", ensembl_file_path) + + # Run the ScheduleAlignment instance + schedule_alignment_instance.run() + + # Check that an alignment path was created + alignment_path = os.path.join(test_scratch_path, "homo_sapiens", "999", "alignment") + assert os.path.exists(alignment_path), f"Expected path {alignment_path} not created" + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file + expected_content = [ + { + "species_name": "homo_sapiens", "align_method": "--bestn 1", "query_cutoff": 100, "target_cutoff": 100, "max_chunks": 3, "chunk": 1, + "job_index": 1, "source_file": "dummy_xref_fasta.fa", "target_file": ensembl_file_path, "xref_db_url": "mysql://user:pass@host/xref_db", + "map_file": os.path.join(alignment_path, "peptide_alignment_1_1_of_3.map"), "source_id": 1, "source_name": "RefSeq_peptide", "seq_type": "peptide" + }, + { + "species_name": "homo_sapiens", "align_method": "--bestn 1", "query_cutoff": 100, "target_cutoff": 100, "max_chunks": 3, "chunk": 2, + "job_index": 1, "source_file": "dummy_xref_fasta.fa", "target_file": ensembl_file_path, "xref_db_url": "mysql://user:pass@host/xref_db", + "map_file": os.path.join(alignment_path, "peptide_alignment_1_2_of_3.map"), "source_id": 1, "source_name": "RefSeq_peptide", "seq_type": "peptide" + }, + { + "species_name": "homo_sapiens", "align_method": "--bestn 1", "query_cutoff": 100, "target_cutoff": 100, "max_chunks": 3, "chunk": 3, + "job_index": 1, "source_file": "dummy_xref_fasta.fa", "target_file": ensembl_file_path, "xref_db_url": "mysql://user:pass@host/xref_db", + "map_file": os.path.join(alignment_path, "peptide_alignment_1_3_of_3.map"), "source_id": 1, "source_name": "RefSeq_peptide", "seq_type": "peptide" + } + ] + check_dataflow_content(dataflow_file_path, expected_content) + finally: + # Cleanup: Remove the dataflow file if it exists + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) + + # Cleanup: Remove the homo_sapiens folder if it exists + ensembl_path = os.path.join(test_scratch_path, "homo_sapiens") + if os.path.exists(ensembl_path): + shutil.rmtree(ensembl_path) \ No newline at end of file diff --git a/src/python/test/xrefs/test_schedule_cleanup.py b/src/python/test/xrefs/test_schedule_cleanup.py new file mode 100644 index 000000000..6d8c8721d --- /dev/null +++ b/src/python/test/xrefs/test_schedule_cleanup.py @@ -0,0 +1,116 @@ +import pytest +import os +import shutil +from 
sqlalchemy import text
+from typing import Any, Dict, Callable, Optional
+from ensembl.utils.database import DBConnection
+from test_helpers import check_dataflow_content
+
+from ensembl.production.xrefs.ScheduleCleanup import ScheduleCleanup
+
+DEFAULT_ARGS = {
+ "base_path": "dummy_base_path",
+ "source_db_url": "mysql://user:pass@host/db",
+}
+
+# Fixture to create a ScheduleCleanup instance
+@pytest.fixture
+def schedule_cleanup() -> Callable[[Optional[Dict[str, Any]]], ScheduleCleanup]:
+ def _create_schedule_cleanup(args: Optional[Dict[str, Any]] = None) -> ScheduleCleanup:
+ # Use provided args or default to default_args
+ args = args or DEFAULT_ARGS
+
+ return ScheduleCleanup(args, True, True)
+ return _create_schedule_cleanup
+
+# Function to populate the database with sources
+def populate_source_db(mock_source_dbi: DBConnection):
+ source_data = [
+ [1, 'DBASS3', 'DBASSParser'],
+ [2, 'RefSeq_dna', 'RefSeqParser'],
+ [3, 'Uniprot/SWISSPROT', 'UniProtParser'],
+ [4, 'VGNC', 'VGNCParser'],
+ ]
+ for row in source_data:
+ mock_source_dbi.execute(
+ text("INSERT INTO source (source_id, name, parser) VALUES (:source_id, :name, :parser)"),
+ {"source_id": row[0], "name": row[1], "parser": row[2],}
+ )
+
+ version_data = [
+ [1, 1, ''],
+ [2, 2, 'dummy_base_path/RefSeq_dna/RefSeq-release200.txt'],
+ [3, 1, 'dummy_base_path/UniprotSWISSPROT/reldate.txt'],
+ [4, 1, ''],
+ ]
+ for row in version_data:
+ mock_source_dbi.execute(
+ text("INSERT INTO version (source_id, priority, revision) VALUES (:source_id, :priority, :revision)"),
+ {"source_id": row[0], "priority": row[1], "revision": row[2],}
+ )
+
+ mock_source_dbi.commit()
+
+# Test case to check if an error is raised when a mandatory parameter is missing
+def test_schedule_cleanup_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]):
+ test_missing_required_param("ScheduleCleanup", DEFAULT_ARGS, "base_path")
+ test_missing_required_param("ScheduleCleanup", DEFAULT_ARGS, "source_db_url")
+
+# Test case to check successful run
+def test_successful_run(mock_source_dbi: DBConnection, schedule_cleanup: ScheduleCleanup, pytestconfig: pytest.Config):
+ # Setup for test parameters and create a ScheduleCleanup instance
+ test_scratch_path = pytestconfig.getoption("test_scratch_path")
+ args = {
+ "base_path": test_scratch_path,
+ "source_db_url": mock_source_dbi.engine.url,
+ "dataflow_output_path": test_scratch_path
+ }
+ schedule_cleanup_instance = schedule_cleanup(args)
+
+ dataflow_file_path = os.path.join(test_scratch_path, "dataflow_cleanup_sources.json")
+ try:
+ # Run the ScheduleCleanup instance without any sources to clean up
+ schedule_cleanup_instance.run()
+
+ # Check that the dataflow file is created
+ assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found"
+
+ # Check that the dataflow file is empty then remove it
+ assert os.path.getsize(dataflow_file_path) == 0, f"Expected file {dataflow_file_path} to be empty"
+ os.remove(dataflow_file_path)
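+
+ # Note: the staged runs in this test suggest that ScheduleCleanup only dataflows a source
+ # for cleanup once a download folder for it exists under base_path, where the folder name
+ # appears to be the source name with any '/' removed (e.g. 'Uniprot/SWISSPROT' becomes
+ # 'UniprotSWISSPROT'); DBASS3 and VGNC, which have no recorded version file and no folder
+ # created in this test, are never expected in dataflow_cleanup_sources.json.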
+ + # Create source folders for cleanup + os.makedirs(f"{test_scratch_path}/RefSeq_dna") + os.makedirs(f"{test_scratch_path}/UniprotSWISSPROT") + + # Run the ScheduleCleanup instance again + schedule_cleanup_instance.run() + + # Check the content of the dataflow file + expected_content = [ + {"name": "RefSeq_dna", "version_file": "dummy_base_path/RefSeq_dna/RefSeq-release200.txt"}, + {"name": "Uniprot/SWISSPROT", "version_file": "dummy_base_path/UniprotSWISSPROT/reldate.txt"} + ] + check_dataflow_content(dataflow_file_path, expected_content) + finally: + # Cleanup: Remove the dataflow file if it exists + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) + + # Cleanup: Remove the created paths if they exist + for source_path in [os.path.join(test_scratch_path, "RefSeq_dna"), os.path.join(test_scratch_path, "UniprotSWISSPROT")]: + if os.path.exists(source_path): + shutil.rmtree(source_path) diff --git a/src/python/test/xrefs/test_schedule_download.py b/src/python/test/xrefs/test_schedule_download.py new file mode 100644 index 000000000..8c17eb123 --- /dev/null +++ b/src/python/test/xrefs/test_schedule_download.py @@ -0,0 +1,116 @@ +import pytest +import io +import json +import os +from datetime import datetime +from unittest.mock import MagicMock, patch +from typing import Any, Dict, Callable, Optional +from sqlalchemy import create_engine, text +from sqlalchemy.engine.url import make_url +from test_helpers import check_dataflow_content + +from ensembl.production.xrefs.ScheduleDownload import ScheduleDownload + +DEFAULT_ARGS = { + "config_file": "dummy_config.json", + "source_db_url": "mysql://user:pass@host/db", + "reuse_db": False, +} + +# Fixture to create a ScheduleDownload instance +@pytest.fixture +def schedule_download() -> Callable[[Optional[Dict[str, Any]]], ScheduleDownload]: + def _create_schedule_download(args: Optional[Dict[str, Any]] = None) -> ScheduleDownload: + # Use provided args or default to default_args + args = args or DEFAULT_ARGS + + return ScheduleDownload(args, True, True) + return _create_schedule_download + +# Test case to check if an error is raised when a mandatory parameter is missing +def test_schedule_download_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): + test_missing_required_param("ScheduleDownload", DEFAULT_ARGS, "config_file") + test_missing_required_param("ScheduleDownload", DEFAULT_ARGS, "source_db_url") + test_missing_required_param("ScheduleDownload", DEFAULT_ARGS, "reuse_db") + +# Test case to check if an error is raised when the config file has an invalid json format +def test_invalid_config_file(schedule_download: ScheduleDownload): + # Create a ScheduleDownload instance + schedule_download_instance = schedule_download() + + # Create an invalid json file + mock_file = io.StringIO('[{"name": "source1", "parser": "parser1", "priority": 1, "file": "file1",}]') + with patch("ensembl.production.xrefs.ScheduleDownload.open", return_value=mock_file, create=True): + # Mock the create_source_db method + schedule_download_instance.create_source_db = MagicMock() + + # Run the ScheduleDownload instance + with pytest.raises(json.decoder.JSONDecodeError): + schedule_download_instance.run() + +# Test case to check if an error is raised when the config file is empty +def test_empty_config_file(schedule_download: ScheduleDownload): + # Create a ScheduleDownload instance + schedule_download_instance = schedule_download() + + # Create an empty json file + mock_file = io.StringIO('[]') + with 
patch("ensembl.production.xrefs.ScheduleDownload.open", return_value=mock_file, create=True): + # Mock the create_source_db method + schedule_download_instance.create_source_db = MagicMock() + + # Run the ScheduleDownload instance + with pytest.raises( + ValueError, match="No sources found in config file dummy_config.json. Need sources to run pipeline" + ): + schedule_download_instance.run() + +# TO DO: Add test case for reuse_db set to True + +# Test case to check successful run +def test_successful_run(schedule_download: ScheduleDownload, pytestconfig): + # Setup for test parameters and create a ScheduleDownload instance + test_scratch_path = pytestconfig.getoption("test_scratch_path") + test_mysql_url = pytestconfig.getoption("test_db_url") + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + user_name = os.getenv("USER", "default_user") + test_db_name = f"{user_name}_test_xref_source_db_{timestamp}" + args = { + "config_file": "flatfiles/sources.json", + "source_db_url": f"{test_mysql_url}/{test_db_name}", + "reuse_db": False, + "dataflow_output_path": test_scratch_path + } + schedule_download_instance = schedule_download(args) + + # Create a db engine for connection + test_engine = create_engine(make_url(test_mysql_url), isolation_level="AUTOCOMMIT") + + dataflow_file_path = os.path.join(test_scratch_path, "dataflow_sources.json") + try: + # Run the ScheduleDownload instance + schedule_download_instance.run() + + # Check if the source db was created + with test_engine.connect() as conn: + result = conn.execute(text("SHOW DATABASES")) + db_names = [row[0] for row in result.fetchall()] + assert test_db_name in db_names, f"Expected database {test_db_name} not found" + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file + expected_content = [ + {"parser": "ArrayExpressParser", "name": "ArrayExpress", "priority": 1, "db": "core", "file": "Database"}, + {"parser": "ChecksumParser", "name": "RNACentral", "priority": 1, "db": "checksum", "file": "https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/md5/md5.tsv.gz"} + ] + check_dataflow_content(dataflow_file_path, expected_content) + finally: + # Cleanup: Drop the test database if it exists + with test_engine.connect() as conn: + conn.execute(text(f"DROP DATABASE IF EXISTS {test_db_name}")) + + # Cleanup: Remove the dataflow file if it exists + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) diff --git a/src/python/test/xrefs/test_schedule_parse.py b/src/python/test/xrefs/test_schedule_parse.py new file mode 100644 index 000000000..04b3cd4ca --- /dev/null +++ b/src/python/test/xrefs/test_schedule_parse.py @@ -0,0 +1,224 @@ +import pytest +import os +from unittest.mock import MagicMock +from sqlalchemy import create_engine, text +from sqlalchemy.engine.url import make_url +from typing import Any, Dict, Callable, Optional +from ensembl.utils.database import DBConnection +from test_helpers import check_row_count, check_dataflow_content + +from ensembl.production.xrefs.ScheduleParse import ScheduleParse + +DEFAULT_ARGS = { + "species_name": "test_homo_sapiens_test", + "release": 999, + "registry_url": "http://dummy_registry", + "priority": 1, + "source_db_url": "mysql://user:pass@host/source_db", + "xref_db_url": "mysql://user:pass@host/xref_db", + "get_species_file": False, + "species_db": "mysql://user:pass@host/core_db", +} + +# Fixture to create a ScheduleParse instance 
+@pytest.fixture +def schedule_parse() -> Callable[[Optional[Dict[str, Any]]], ScheduleParse]: + def _create_schedule_parse(args: Optional[Dict[str, Any]] = None) -> ScheduleParse: + # Use provided args or default to default_args + args = args or DEFAULT_ARGS + + return ScheduleParse(args, True, True) + return _create_schedule_parse + +# Function to populate the database with sources +def populate_source_db(mock_source_dbi: DBConnection): + source_data = [ + [1, 'ArrayExpress', 'ArrayExpressParser'], + [2, 'UniParc', 'ChecksumParser'], + [3, 'DBASS3', 'DBASSParser'], + [4, 'MIM', 'MIMParser'], + [5, 'Reactome', 'ReactomeParser'], + [6, 'RefSeq_dna', 'RefSeqParser'], + [7, 'RefSeq_peptide', 'RefSeqParser'], + [8, 'VGNC', 'VGNCParser'], + ] + for row in source_data: + mock_source_dbi.execute( + text("INSERT INTO source (source_id, name, parser) VALUES (:source_id, :name, :parser)"), + {"source_id": row[0], "name": row[1], "parser": row[2],} + ) + + version_data = [ + [1, 'Database', 'core', 1, None, None], + [2, 'dummy_uniparc_file_path', 'checksum', 1, None, None], + [3, 'dummy_dbass_file_path', None, 1, None, None], + [4, 'dummy_mim_file_path', None, 2, None, None], + [5, 'dummy_reactome_file_path', None, 2, 'dummy_reactome_release', None], + [6, 'dummy_refseq_dna_file_path', None, 2, 'dummy_refseq_dna_release', 'dummy_refseq_dna_clean_path'], + [7, 'dummy_refseq_peptide_file_path', None, 3, 'dummy_refseq_peptide_release', 'dummy_refseq_peptide_clean_path'], + [8, 'dummy_vgnc_file_path', None, 1, None, None], + ] + for row in version_data: + mock_source_dbi.execute( + text("INSERT INTO version (source_id, file_path, db, priority, revision, clean_path) VALUES (:source_id, :file_path, :db, :priority, :revision, :clean_path)"), + {"source_id": row[0], "file_path": row[1], "db": row[2], "priority": row[3], "revision": row[4], "clean_path": row[5]} + ) + + mock_source_dbi.commit() + +# Test case to check if an error is raised when a mandatory parameter is missing +def test_schedule_parse_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "species_name") + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "release") + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "registry_url") + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "priority") + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "source_db_url") + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "xref_db_url") + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "get_species_file") + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "sources_config_file") + +# Test case to check if an error is raised when priority is invalid +def test_invalid_priority(schedule_parse: ScheduleParse): + args = DEFAULT_ARGS.copy() + args["priority"] = 4 + schedule_parse_instance = schedule_parse(args) + + with pytest.raises(AttributeError, match="Parameter 'priority' can only be of value 1, 2, or 3"): + schedule_parse_instance.run() + +# Test case to check successful run +def test_successful_run(mock_source_dbi: DBConnection, schedule_parse: ScheduleParse, pytestconfig): + # Setup for test parameters and create a ScheduleParse instance + test_scratch_path = pytestconfig.getoption("test_scratch_path") + test_mysql_url = pytestconfig.getoption("test_db_url") + args = DEFAULT_ARGS.copy() + args["source_db_url"] = mock_source_dbi.engine.url + args["xref_db_url"] = test_mysql_url + 
args["dataflow_output_path"] = test_scratch_path + args["sources_config_file"] = "flatfiles/config.ini" + schedule_parse_instance = schedule_parse(args) + + # Add source data into source db + populate_source_db(mock_source_dbi) + + # Mock needed methods + schedule_parse_instance.get_core_db_info = MagicMock(return_value=(9606, 7742)) + + # Create a db engine for connection + test_engine = create_engine(make_url(test_mysql_url), isolation_level="AUTOCOMMIT") + + try: + # Run the ScheduleParse instance with priority 1 + schedule_parse_instance.run() + + # Check if the xref update db was created + with test_engine.connect() as conn: + result = conn.execute(text("SHOW DATABASES")) + db_names = [row[0] for row in result.fetchall()] + assert "test_homo_sapiens_test_xref_update_999" in db_names, "Expected database test_homo_sapiens_test_xref_update_999 not found" + + # Connect to the db itself and create a table + db_engine = create_engine(make_url(f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999"), isolation_level="AUTOCOMMIT") + with db_engine.connect() as db_conn: + check_row_count(db_conn, "source", 11) + check_row_count(db_conn, "source_url", 14) + check_row_count(db_conn, "species", 3) + + # Get the source ids + source_ids = {} + result = db_conn.execute(text("SELECT source_id,name,priority_description FROM source")).all() + for row in result: + if source_ids.get(row[1]): + source_ids[row[1]].update({row[2]: row[0]}) + else: + source_ids[row[1]] = {row[2]: row[0]} + + # Get + + # Check the dataflow files + expected_content = { + "primary_sources": [ + { + "species_name": "test_homo_sapiens_test", "species_id": 9606, "core_db_url": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999", + "source_id": source_ids["ArrayExpress"]["multi"], "source_name": "ArrayExpress", "parser": "ArrayExpressParser", "db": "core", "file_name": "Database" + }, + { + "species_name": "test_homo_sapiens_test", "species_id": 9606, "core_db_url": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999", + "source_id": source_ids["DBASS3"]["human"], "source_name": "DBASS3", "parser": "DBASSParser", "file_name": "dummy_dbass_file_path" + } + ], + "schedule_secondary": [ + {"species_name": "test_homo_sapiens_test", "species_db": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999"} + ] + } + for dataflow_file in ["primary_sources", "schedule_secondary"]: + # Check if the dataflow file is created + dataflow_file_path = os.path.join(test_scratch_path, f"dataflow_{dataflow_file}.json") + + # Check the content of the dataflow file + check_dataflow_content(dataflow_file_path, expected_content[dataflow_file]) + + # Run the ScheduleParse instance again with priority 2 + schedule_parse_instance.set_param("priority", 2) + schedule_parse_instance.set_param("xref_db_url", f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999") + schedule_parse_instance.run() + + # Check the dataflow files + expected_content = { + "secondary_sources": [ + { + "species_name": "test_homo_sapiens_test", "species_id": 9606, "core_db_url": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999", + "source_id": source_ids["MIM"]["human"], "source_name": "MIM", "parser": "MIMParser", "file_name": "dummy_mim_file_path" + }, + { + "species_name": "test_homo_sapiens_test", "species_id": 9606, "core_db_url": 
"mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999", + "source_id": source_ids["Reactome"]["multi"], "source_name": "Reactome", "parser": "ReactomeParser", "release_file": "dummy_reactome_release", "file_name": "dummy_reactome_file_path" + }, + { + "species_name": "test_homo_sapiens_test", "species_id": 9606, "core_db_url": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999", + "source_id": source_ids["RefSeq_dna"]["human"], "source_name": "RefSeq_dna", "parser": "RefSeqParser", "release_file": "dummy_refseq_dna_release", "file_name": "dummy_refseq_dna_clean_path" + } + ], + "schedule_tertiary": [ + {"species_name": "test_homo_sapiens_test", "species_db": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999"} + ] + } + for dataflow_file in ["secondary_sources", "schedule_tertiary"]: + # Check if the dataflow file is created + dataflow_file_path = os.path.join(test_scratch_path, f"dataflow_{dataflow_file}.json") + + # Check the content of the dataflow file + check_dataflow_content(dataflow_file_path, expected_content[dataflow_file]) + + # Run the ScheduleParse instance again with priority 2 + schedule_parse_instance.set_param("priority", 3) + schedule_parse_instance.run() + + # Check the dataflow files + expected_content = { + "tertiary_sources": [ + { + "species_name": "test_homo_sapiens_test", "species_id": 9606, "core_db_url": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999", + "source_id": source_ids["RefSeq_peptide"]["human"], "source_name": "RefSeq_peptide", "parser": "RefSeqParser", "release_file": "dummy_refseq_peptide_release", "file_name": "dummy_refseq_peptide_clean_path" + } + ], + "dump_ensembl": [ + {"species_name": "test_homo_sapiens_test", "species_db": "mysql://user:pass@host/core_db", "xref_db_url": f"{test_mysql_url}/test_homo_sapiens_test_xref_update_999"} + ] + } + for dataflow_file in ["tertiary_sources", "dump_ensembl"]: + # Check if the dataflow file is created + dataflow_file_path = os.path.join(test_scratch_path, f"dataflow_{dataflow_file}.json") + + # Check the content of the dataflow file + check_dataflow_content(dataflow_file_path, expected_content[dataflow_file]) + finally: + # Cleanup: Drop the test database if it exists + with test_engine.connect() as conn: + conn.execute(text("DROP DATABASE IF EXISTS test_homo_sapiens_test_xref_update_999")) + + # Cleanup: Remove the dataflow files if they exist + for dataflow_file in ["primary_sources", "schedule_secondary", "secondary_sources", "schedule_tertiary", "tertiary_sources", "dump_ensembl"]: + dataflow_file_path = os.path.join(test_scratch_path, f"dataflow_{dataflow_file}.json") + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) diff --git a/src/python/test/xrefs/test_schedule_species.py b/src/python/test/xrefs/test_schedule_species.py new file mode 100644 index 000000000..a018f7861 --- /dev/null +++ b/src/python/test/xrefs/test_schedule_species.py @@ -0,0 +1,358 @@ +import pytest +import os +import re +from typing import Any, Dict, Callable, Optional, List +from sqlalchemy import create_engine, text +from sqlalchemy.engine.url import make_url +from test_helpers import check_dataflow_content + +from ensembl.production.xrefs.ScheduleSpecies import ScheduleSpecies + +DEFAULT_ARGS = { + "run_all": False, + "registry_url": "http://dummy_registry", + "release": 999, + 
"metasearch_url": "http://dummy_metasearch", +} + +# Fixture to create a ScheduleSpecies instance +@pytest.fixture +def schedule_species() -> Callable[[Optional[Dict[str, Any]]], ScheduleSpecies]: + def _create_schedule_species(args: Optional[Dict[str, Any]] = None) -> ScheduleSpecies: + # Use provided args or default to default_args + args = args or DEFAULT_ARGS + + return ScheduleSpecies(args, True, True) + return _create_schedule_species + +# Function to create dbs in the registry +def create_dbs_in_registry(registry_url: str, dbs: Dict[str, Dict[str, Any]]) -> List[str]: + dbs_to_cleanup = [] + + test_engine = create_engine(make_url(registry_url), isolation_level="AUTOCOMMIT") + with test_engine.connect() as conn: + # Get all dbs in the registry first + existing_dbs = conn.execute(text(f"SHOW DATABASES")).fetchall() + existing_dbs = [db[0] for db in existing_dbs] + + # Create the dbs that are not already in the registry + for db_name, db_meta in dbs.items(): + if db_name not in existing_dbs: + conn.execute(text(f"CREATE DATABASE {db_name}")) + dbs_to_cleanup.append(db_name) + + release = db_meta.get("release") + division = db_meta.get("division") + + # Connect to the db itself and create a table + db_engine = create_engine(make_url(f"{registry_url}/{db_name}"), isolation_level="AUTOCOMMIT") + with db_engine.connect() as db_conn: + db_conn.execute(text("CREATE TABLE dna (seq_region_id INT(10) PRIMARY KEY, sequence VARCHAR(255) NOT NULL)")) + db_conn.execute(text("CREATE TABLE meta (meta_id INT(10) AUTO_INCREMENT PRIMARY KEY, species_id INT(10) DEFAULT 1, meta_key VARCHAR(40) NOT NULL, meta_value VARCHAR(255) NOT NULL)")) + db_conn.execute(text(f"INSERT INTO meta (meta_key, meta_value) VALUES ('schema_version', '{release}')")) + if division: + db_conn.execute(text(f"INSERT INTO meta (meta_key, meta_value) VALUES ('species.division', '{division}')")) + + return dbs_to_cleanup + +# Function to cleanup dbs in the registry +def cleanup_dbs_in_registry(registry_url: str, dbs: List[str]) -> None: + test_engine = create_engine(make_url(registry_url), isolation_level="AUTOCOMMIT") + with test_engine.connect() as conn: + for db in dbs: + conn.execute(text(f"DROP DATABASE IF EXISTS {db}")) + +def clean_registry_url(registry_url: str) -> str: + match = re.search(r"^(.*)://(.*)", registry_url) + if match: + registry_url = match.group(2) + match = re.search(r"(.*)/(.*)$", registry_url) + if match: + registry_url = match.group(1) + + return registry_url + +# Test case to check if an error is raised when a mandatory parameter is missing +def test_schedule_species_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): + test_missing_required_param("ScheduleSpecies", DEFAULT_ARGS, "run_all") + test_missing_required_param("ScheduleSpecies", DEFAULT_ARGS, "registry_url") + test_missing_required_param("ScheduleSpecies", DEFAULT_ARGS, "release") + +# Test case to check if an error is raised when no species or division are provided +def test_invalid_input(schedule_species: ScheduleSpecies): + # Create a ScheduleSpecies instance + schedule_species_instance = schedule_species() + + with pytest.raises(ValueError, match="Must provide species or division with run_all set to False"): + schedule_species_instance.run() + +# Test case to check if an error is raised when no dbs are found (empty registry) +def test_no_dbs_found(schedule_species: ScheduleSpecies): + # Create a ScheduleSpecies instance + args = DEFAULT_ARGS.copy() + args["run_all"] = True + 
schedule_species_instance = schedule_species(args) + + with pytest.raises(LookupError, match="Could not find any matching dbs in registry dummy_registry"): + schedule_species_instance.run() + +# Test case to check if an error is raised when a species db is present more than once +def test_duplicate_species_dbs(schedule_species: ScheduleSpecies, pytestconfig): + # Create a ScheduleSpecies instance + test_mysql_url = pytestconfig.getoption("test_db_url") + args = DEFAULT_ARGS.copy() + args["registry_url"] = test_mysql_url + args["run_all"] = True + schedule_species_instance = schedule_species(args) + + # Create the dbs in the registry + dbs = { + "bos_taurus_core_999_1" : {"release": 999}, + "bos_taurus_core_999_1_temp": {"release": 999}, + } + created_dbs = create_dbs_in_registry(test_mysql_url, dbs) + + clean_url = clean_registry_url(test_mysql_url) + try: + with pytest.raises(ValueError, match=f"Database {clean_url}/bos_taurus_core_999_1 already loaded for species bos_taurus, cannot load second database {clean_url}/bos_taurus_core_999_1_temp"): + schedule_species_instance.run() + finally: + # Cleanup the dbs in the registry + cleanup_dbs_in_registry(test_mysql_url, created_dbs) + +# Test case to check if an error is raised when a requested species is not found +def test_species_not_found(schedule_species: ScheduleSpecies, pytestconfig): + # Create a ScheduleSpecies instance + test_mysql_url = pytestconfig.getoption("test_db_url") + args = DEFAULT_ARGS.copy() + args["registry_url"] = test_mysql_url + args["species"] = ["species1"] + schedule_species_instance = schedule_species(args) + + with pytest.raises(LookupError, match="Database not found for species1, check registry parameters"): + schedule_species_instance.run() + +# Test case to check successful run with run_all parameter +def test_successful_run_all(schedule_species: ScheduleSpecies, pytestconfig): + # Create a ScheduleSpecies instance + test_mysql_url = pytestconfig.getoption("test_db_url") + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = DEFAULT_ARGS.copy() + args["registry_url"] = test_mysql_url + args["run_all"] = True + args["dataflow_output_path"] = test_scratch_path + schedule_species_instance = schedule_species(args) + + # Create the dbs in the registry + dbs = { + "bos_taurus_core_999_1": {"release": 999}, + "danio_rerio_core_999_1": {"release": 999}, + "equus_caballus_core_999_1": {"release": 999}, + "homo_sapiens_core_998_1": {"release": 998}, + } + created_dbs = create_dbs_in_registry(test_mysql_url, dbs) + + dataflow_file_path = os.path.join(test_scratch_path, "dataflow_species.json") + clean_url = clean_registry_url(test_mysql_url) + try: + # Run the ScheduleSpecies instance + schedule_species_instance.run() + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file + expected_content = [ + {"species_name": "bos_taurus", "species_db": f"{clean_url}/bos_taurus_core_999_1"}, + {"species_name": "danio_rerio", "species_db": f"{clean_url}/danio_rerio_core_999_1"}, + {"species_name": "equus_caballus", "species_db": f"{clean_url}/equus_caballus_core_999_1"} + ] + check_dataflow_content(dataflow_file_path, expected_content) + finally: + # Cleanup the dbs in the registry + cleanup_dbs_in_registry(test_mysql_url, created_dbs) + + # Cleanup: Remove the dataflow file if it exists + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) + +# Test case to check 
successful run with run_all parameter with db_prefix set +def test_successful_run_all_prefix(schedule_species: ScheduleSpecies, pytestconfig): + # Create a ScheduleSpecies instance + test_mysql_url = pytestconfig.getoption("test_db_url") + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = DEFAULT_ARGS.copy() + args["registry_url"] = test_mysql_url + args["run_all"] = True + args["db_prefix"] = "testprefix" + args["dataflow_output_path"] = test_scratch_path + schedule_species_instance = schedule_species(args) + + # Create the dbs in the registry + dbs = { + "bos_taurus_core_999_1": {"release": 999}, + "danio_rerio_core_999_1": {"release": 999}, + "equus_caballus_core_999_1": {"release": 999}, + "homo_sapiens_core_998_1": {"release": 998}, + "testprefix_homo_sapiens_core_999_1": {"release": 999}, + } + created_dbs = create_dbs_in_registry(test_mysql_url, dbs) + + dataflow_file_path = os.path.join(test_scratch_path, "dataflow_species.json") + clean_url = clean_registry_url(test_mysql_url) + try: + # Run the ScheduleSpecies instance + schedule_species_instance.run() + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file + expected_content = [ + {"species_name": "homo_sapiens", "species_db": f"{clean_url}/testprefix_homo_sapiens_core_999_1"}, + ] + check_dataflow_content(dataflow_file_path, expected_content) + finally: + # Cleanup the dbs in the registry + cleanup_dbs_in_registry(test_mysql_url, created_dbs) + + # Cleanup: Remove the dataflow file if it exists + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) + +# Test case to check successful run with specified species +def test_successful_run_species(schedule_species: ScheduleSpecies, pytestconfig): + # Create a ScheduleSpecies instance + test_mysql_url = pytestconfig.getoption("test_db_url") + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = DEFAULT_ARGS.copy() + args["registry_url"] = test_mysql_url + args["dataflow_output_path"] = test_scratch_path + args["species"] = ["bos_taurus", "danio_rerio"] + schedule_species_instance = schedule_species(args) + + # Create the dbs in the registry + dbs = { + "bos_taurus_core_999_1": {"release": 999}, + "danio_rerio_core_999_1": {"release": 999}, + "equus_caballus_core_999_1": {"release": 999}, + "homo_sapiens_core_998_1": {"release": 998}, + } + created_dbs = create_dbs_in_registry(test_mysql_url, dbs) + + dataflow_file_path = os.path.join(test_scratch_path, "dataflow_species.json") + clean_url = clean_registry_url(test_mysql_url) + try: + # Run the ScheduleSpecies instance + schedule_species_instance.run() + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file then remove it + expected_content = [ + {"species_name": "bos_taurus", "species_db": f"{clean_url}/bos_taurus_core_999_1"}, + {"species_name": "danio_rerio", "species_db": f"{clean_url}/danio_rerio_core_999_1"}, + ] + check_dataflow_content(dataflow_file_path, expected_content) + os.remove(dataflow_file_path) + + # Change the antispecies + schedule_species_instance.set_param("antispecies", ["danio_rerio"]) + + # Run the ScheduleSpecies instance again + schedule_species_instance.run() + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # 
Check the content of the dataflow file + expected_content = [ + {"species_name": "bos_taurus", "species_db": f"{clean_url}/bos_taurus_core_999_1"}, + ] + check_dataflow_content(dataflow_file_path, expected_content) + finally: + # Cleanup the dbs in the registry + cleanup_dbs_in_registry(test_mysql_url, created_dbs) + + # Cleanup: Remove the dataflow file if it exists + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) + +# Test case to check successful run with specified division +def test_successful_run_division(schedule_species: ScheduleSpecies, pytestconfig): + # Create a ScheduleSpecies instance + test_mysql_url = pytestconfig.getoption("test_db_url") + test_scratch_path = pytestconfig.getoption("test_scratch_path") + args = DEFAULT_ARGS.copy() + args["registry_url"] = test_mysql_url + args["dataflow_output_path"] = test_scratch_path + args["division"] = "EnsemblVertebrates" + schedule_species_instance = schedule_species(args) + + # Create the dbs in the registry + dbs = { + "bos_taurus_core_999_1": {"release": 999, "division": "EnsemblVertebrates"}, + "danio_rerio_core_999_1": {"release": 999, "division": "EnsemblVertebrates"}, + "equus_caballus_core_999_1": {"release": 999, "division": "EnsemblVertebrates"}, + "equus_caballus_core_998_1": {"release": 998, "division": "EnsemblVertebrates"}, + "zea_mays_core_999_1": {"release": 999, "division": "EnsemblPlants"}, + } + created_dbs = create_dbs_in_registry(test_mysql_url, dbs) + + dataflow_file_path = os.path.join(test_scratch_path, "dataflow_species.json") + clean_url = clean_registry_url(test_mysql_url) + try: + # Run the ScheduleSpecies instance + schedule_species_instance.run() + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file then remove it + expected_content = [ + {"species_name": "bos_taurus", "species_db": f"{clean_url}/bos_taurus_core_999_1"}, + {"species_name": "danio_rerio", "species_db": f"{clean_url}/danio_rerio_core_999_1"}, + {"species_name": "equus_caballus", "species_db": f"{clean_url}/equus_caballus_core_999_1"}, + ] + check_dataflow_content(dataflow_file_path, expected_content) + os.remove(dataflow_file_path) + + # Change the antispecies + schedule_species_instance.set_param("antispecies", ["danio_rerio"]) + + # Run the ScheduleSpecies instance again + schedule_species_instance.run() + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file + expected_content = [ + {"species_name": "bos_taurus", "species_db": f"{clean_url}/bos_taurus_core_999_1"}, + {"species_name": "equus_caballus", "species_db": f"{clean_url}/equus_caballus_core_999_1"}, + ] + check_dataflow_content(dataflow_file_path, expected_content) + os.remove(dataflow_file_path) + + # Change the division + schedule_species_instance.set_param("division", "EnsemblPlants") + + # Run the ScheduleSpecies instance again + schedule_species_instance.run() + + # Check if the dataflow file is created + assert os.path.exists(dataflow_file_path), f"Expected file {dataflow_file_path} not found" + + # Check the content of the dataflow file + expected_content = [ + {"species_name": "zea_mays", "species_db": f"{clean_url}/zea_mays_core_999_1"}, + ] + check_dataflow_content(dataflow_file_path, expected_content) + finally: + # Cleanup the dbs in the registry + cleanup_dbs_in_registry(test_mysql_url, 
created_dbs) + + # Cleanup: Remove the dataflow file if it exists + if os.path.exists(dataflow_file_path): + os.remove(dataflow_file_path) \ No newline at end of file From 5c5f862f34c0d3d202830faedf1e7c89affedec1 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 18 Nov 2024 12:57:00 +0000 Subject: [PATCH 04/12] Changes/fixes to the download pipeline --- scripts/xrefs/cleanup_and_split_source.pl | 104 +-- scripts/xrefs/cleanup_source.pl | 268 +++--- src/python/ensembl/common/Params.py | 348 +++++--- src/python/ensembl/production/xrefs/Base.py | 810 ++++++++---------- .../ensembl/production/xrefs/Checksum.py | 26 +- .../production/xrefs/DownloadSource.py | 59 +- .../production/xrefs/EmailNotification.py | 399 ++++----- .../production/xrefs/ScheduleCleanup.py | 33 +- .../production/xrefs/ScheduleDownload.py | 17 +- .../xrefs/config/gencode_sources.json | 204 +++++ .../xrefs/config/xref_all_sources.json | 71 +- .../production/xrefs/config/xref_config.ini | 28 +- 12 files changed, 1231 insertions(+), 1136 deletions(-) create mode 100644 src/python/ensembl/production/xrefs/config/gencode_sources.json diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl index cb92281a3..f1ea08be0 100644 --- a/scripts/xrefs/cleanup_and_split_source.pl +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -42,11 +42,11 @@ ); # Check that all mandatory parameters are passed -if (!defined($base_path) || !defined($source_db_url) || !defined($source_name) || !defined($clean_dir) || !defined($clean_files)) { - croak "Usage: cleanup_source.pl --base_path --source_db_url --name --clean_dir --clean_files [--version_file ] [--tax_ids_file ] [--update_mode ] [--log_timestamp ]"; +foreach my $param ($base_path, $source_db_url, $source_name, $clean_dir, $clean_files) { + defined $param or croak "Usage: cleanup_source.pl --base_path --source_db_url --name --clean_dir --clean_files [--version_file ] [--tax_ids_file ] [--update_mode ] [--log_timestamp ]"; } -if (!defined($update_mode)) {$update_mode = 0;} +$update_mode //= 0; my $log_file; if (defined($log_timestamp)) { @@ -65,18 +65,15 @@ } # Remove last '/' character if it exists -if ($base_path =~ /\/$/) {chop($base_path);} +chop($base_path) if $base_path =~ /\/$/; # Remove / char from source name to access directory -my $clean_name = $source_name; -$clean_name =~ s/\///g; +(my $clean_name = $source_name) =~ s/\///g; -my $output_path = $clean_dir."/".$clean_name; +my $output_path = catdir($clean_dir, $clean_name); # Create needed directories -if (!$update_mode) { - rmtree($output_path); -} +rmtree($output_path) unless $update_mode; make_path($output_path); my $sources_to_remove; @@ -89,11 +86,7 @@ $output_file_name = ($source_name =~ /SPTREMBL/ ? 
'uniprot_trembl' : 'uniprot_sprot'); # Set sources to skip in parsing step - my @source_names = ( - 'GO', 'UniGene', 'RGD', 'CCDS', 'IPI', 'UCSC', 'SGD', 'HGNC', 'MGI', 'VGNC', 'Orphanet', - 'ArrayExpress', 'GenomeRNAi', 'EPD', 'Xenbase', 'Reactome', 'MIM_GENE', 'MIM_MORBID', 'MIM', - 'Interpro' - ); + my @source_names = qw(GO UniGene RGD CCDS IPI UCSC SGD HGNC MGI VGNC Orphanet ArrayExpress GenomeRNAi EPD Xenbase Reactome MIM_GENE MIM_MORBID MIM Interpro); $sources_to_remove = join("|", @source_names); } elsif ($source_name =~ /^RefSeq_dna/) { $is_refseq_dna = 1; @@ -109,47 +102,45 @@ my %tax_ids; my ($skipped_species, $added_species) = (0, 0); if ($tax_ids_file && $update_mode) { - open my $fh, '<', $tax_ids_file; + open my $fh, '<', $tax_ids_file or die "Couldn't open tax_ids_file '$tax_ids_file' $!"; chomp(my @lines = <$fh>); close $fh; %tax_ids = map { $_ => 1 } @lines; # Check if any taxonomy IDs already have files - foreach my $tax_id (keys(%tax_ids)) { - my @tax_files = glob($output_path . "/**/**/**/**/" . $output_file_name . "-" . $tax_id); - if (scalar(@tax_files) > 0) { + foreach my $tax_id (keys %tax_ids) { + my @tax_files = glob(catfile($output_path, "**", "**", "**", "**", "$output_file_name-$tax_id")); + if (@tax_files) { $tax_ids{$tax_id} = 0; $skipped_species++; } } # Do nothing if all taxonomy IDs already have files - if ($skipped_species == scalar(keys(%tax_ids))) { + if ($skipped_species == keys %tax_ids) { add_to_log_file($log_file, "All provided tax IDs already have files. Doing nothing."); exit; } } # Get all files for source -my $files_path = $base_path."/".$clean_name; -my @files = glob($files_path."/*"); +my $files_path = catdir($base_path, $clean_name); +my @files = glob(catfile($files_path, "*")); my $out_fh; my $current_species_id; # Process each file -foreach my $input_file_name (@files) { - local $/ = "//\n"; +foreach my $input_file (@files) { + # Skip the release file + next if defined($version_file) && $input_file eq $version_file; - add_to_log_file($log_file, "Splitting up file $input_file_name"); + local $/ = "//\n"; - $input_file_name = basename($input_file_name); - my $input_file = $files_path."/".$input_file_name; + add_to_log_file($log_file, "Splitting up file $input_file"); + my $input_file_name = basename($input_file); my $in_fh; - # Skip the release file - if (defined($version_file) && $input_file eq $version_file) {next;} - - # Open file normally or with zcat for zipped filed + # Open file normally or with zcat for zipped files if ($input_file_name =~ /\.(gz|Z)$/x) { open($in_fh, "zcat $input_file |") or die "Couldn't call 'zcat' to open input file '$input_file' $!"; $output_file_name =~ s/\.[^.]+$//; @@ -167,14 +158,14 @@ my $species_id; if ($is_uniprot) { ($species_id) = $record =~ /OX\s+[a-zA-Z_]+=([0-9 ,]+).*;/; - $species_id =~ s/\s// if $species_id; + $species_id =~ s/\s//g if $species_id; } else { ($species_id) = $record =~ /db_xref=.taxon:(\d+)/; } # Only continue with wanted species - next if (!$species_id); - next if ($tax_ids_file && (!defined($tax_ids{$species_id}) || !$tax_ids{$species_id})); + next unless $species_id; + next if $tax_ids_file && (!defined($tax_ids{$species_id}) || !$tax_ids{$species_id}); # Clean up data if ($clean_files) { @@ -205,31 +196,27 @@ } } - if (!$skip_data) { - push(@new_record, $line); - } - - $record = join("\n", @new_record); + push(@new_record, $line) unless $skip_data; } + + $record = join("\n", @new_record); } } # Write the record in the appropriate file if (!defined($current_species_id) || 
(defined($current_species_id) && $species_id ne $current_species_id)) { - close($out_fh) if (defined($current_species_id)); + close($out_fh) if defined($current_species_id); my $species_id_str = sprintf("%04d", $species_id); my @digits = split('', $species_id_str); - $write_path = catdir($output_path, $digits[0], $digits[1], $digits[2], $digits[3]); + $write_path = catdir($output_path, @digits); make_path($write_path); - $write_file = $write_path."/".$output_file_name."-".$species_id; + $write_file = catfile($write_path, "$output_file_name-$species_id"); # Check if creating new file - if (!-e $write_file) { - $added_species++; - } + $added_species++ unless -e $write_file; open($out_fh, '>>', $write_file) or die "Couldn't open output file '$write_file' $!"; @@ -240,42 +227,33 @@ } close($in_fh); - close($out_fh) if $out_fh; } } +close($out_fh) if $out_fh; + add_to_log_file($log_file, "Source $source_name cleaned up"); add_to_log_file($log_file, "$source_name skipped species = $skipped_species"); add_to_log_file($log_file, "$source_name species files created = $added_species"); # Save the clean files directory in source db -my ($user, $pass, $host, $port, $source_db) = parse_url($source_db_url); +my ($host, $port, $user, $pass, $source_db) = parse_url($source_db_url); my $dbi = get_dbi($host, $port, $user, $pass, $source_db); -my $update_version_sth = $dbi->prepare("UPDATE IGNORE version set clean_uri=? where source_id=(SELECT source_id FROM source WHERE name=?)"); +my $update_version_sth = $dbi->prepare("UPDATE IGNORE version SET clean_path=? WHERE source_id=(SELECT source_id FROM source WHERE name=?)"); $update_version_sth->execute($output_path, $source_name); $update_version_sth->finish(); sub get_dbi { my ($host, $port, $user, $pass, $dbname) = @_; - my $dbconn; - if (defined $dbname) { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname); - } else { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s", $host, $port); - } - my $dbi = DBI->connect( $dbconn, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr ); + my $dbconn = defined $dbname ? sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname) : sprintf("dbi:mysql:host=%s;port=%s", $host, $port); + my $dbi = DBI->connect($dbconn, $user, $pass, { 'RaiseError' => 1 }) or croak("Can't connect to database: " . 
$DBI::errstr); return $dbi; } sub parse_url { my ($url) = @_; my $parsed_url = Nextflow::Utils::parse($url); - my $user = $parsed_url->{'user'}; - my $pass = $parsed_url->{'pass'}; - my $host = $parsed_url->{'host'}; - my $port = $parsed_url->{'port'}; - my $db = $parsed_url->{'dbname'}; - return ($user, $pass, $host, $port, $db); + return @{$parsed_url}{qw(host port user pass dbname)}; } sub add_to_log_file { @@ -284,8 +262,8 @@ sub add_to_log_file { if (defined($log_file)) { my $current_timestamp = strftime "%d-%b-%Y %H:%M:%S", localtime; - open(my $fh, '>>', $log_file); + open(my $fh, '>>', $log_file) or die "Couldn't open log file '$log_file' $!"; print $fh "$current_timestamp | INFO | $message\n"; close($fh); } -} \ No newline at end of file +} diff --git a/scripts/xrefs/cleanup_source.pl b/scripts/xrefs/cleanup_source.pl index 1226e6e1c..07b330717 100644 --- a/scripts/xrefs/cleanup_source.pl +++ b/scripts/xrefs/cleanup_source.pl @@ -37,8 +37,8 @@ ); # Check that all mandatory parameters are passed -if (!defined($base_path) || !defined($source_db_url) || !defined($source_name) || !defined($clean_dir) || !defined($skip_download) || !defined($clean_files)) { - croak "Usage: cleanup_source.pl --base_path --source_db_url --name --clean_dir --skip_download --clean_files [--version_file ] [--log_timestamp ]"; +foreach my $param ($base_path, $source_db_url, $source_name, $clean_dir, $skip_download, $clean_files) { + defined $param or croak "Usage: cleanup_source.pl --base_path --source_db_url --name --clean_dir --skip_download --clean_files [--version_file ] [--log_timestamp ]"; } my $log_file; @@ -50,176 +50,160 @@ add_to_log_file($log_file, "CleanupSource starting for source $source_name"); } -# Do nothing if not cleaning files, not a uniprot or refseq source, or no new download -if ($clean_files && ($source_name =~ /^Uniprot/ || $source_name =~ /^RefSeq_/)) { - # Remove last '/' character if it exists - if ($base_path =~ /\/$/) {chop($base_path);} +# Do nothing if not cleaning files or if not a uniprot or refseq source +if (!$clean_files || ($source_name !~ /^Uniprot/ && $source_name !~ /^RefSeq_/)) { + add_to_log_file($log_file, "Provided source name is invalid. 
Can only clean up and split Uniprot or RefSeq files."); + exit; +} - # Remove / char from source name to access directory - my $clean_name = $source_name; - $clean_name =~ s/\///g; +# Remove last '/' character if it exists +chop($base_path) if $base_path =~ /\/$/; - my $output_path = $clean_dir."/".$clean_name; - my $update_clean_uri = 0; +# Remove / char from source name to access directory +(my $clean_name = $source_name) =~ s/\///g; - # If not a new download, check if clean files exist - if ($skip_download) { - if (-d $output_path) { - $update_clean_uri = 1 - } - } else { - # Create needed directories - make_path($output_path); +my $output_path = catdir($clean_dir, $clean_name); +my $update_clean_uri = 0; +# If not a new download, check if clean files exist +if ($skip_download) { + if (-d $output_path) { $update_clean_uri = 1; + } +} else { + # Create needed directories + make_path($output_path); + $update_clean_uri = 1; + + my $sources_to_remove; + my ($is_uniprot, $is_refseq_dna, $is_refseq_peptide) = (0, 0, 0); + my $file_size = 0; + + # Set sources to skip in parsing step (uniprot only) + if ($source_name =~ /^Uniprot/) { + $is_uniprot = 1; + my @source_names = qw(GO UniGene RGD CCDS IPI UCSC SGD HGNC MGI VGNC Orphanet ArrayExpress GenomeRNAi EPD Xenbase Reactome MIM_GENE MIM_MORBID MIM Interpro); + $sources_to_remove = join("|", @source_names); + $file_size = 200000; + } elsif ($source_name =~ /^RefSeq_dna/) { + $is_refseq_dna = 1; + } elsif ($source_name =~ /^RefSeq_peptide/) { + $is_refseq_peptide = 1; + } else { + croak "Unknown file type $source_name"; + } - my $sources_to_remove; - my ($is_uniprot, $is_refseq_dna, $is_refseq_peptide) = (0, 0, 0); - my $file_size = 0; - - # Set sources to skip in parsing step (uniprot only) - if ($source_name =~ /^Uniprot/) { - $is_uniprot = 1; - my @source_names = ( - 'GO', 'UniGene', 'RGD', 'CCDS', 'IPI', 'UCSC', 'SGD', 'HGNC', 'MGI', 'VGNC', 'Orphanet', - 'ArrayExpress', 'GenomeRNAi', 'EPD', 'Xenbase', 'Reactome', 'MIM_GENE', 'MIM_MORBID', 'MIM', - 'Interpro' - ); - $sources_to_remove = join("|", @source_names); - $file_size = 200000; - } elsif ($source_name =~ /^RefSeq_dna/) { - $is_refseq_dna = 1; - } elsif ($source_name =~ /^RefSeq_peptide/) { - $is_refseq_peptide = 1; - } else { - croak "Unknown file type $source_name"; - } + # Get all files for source + my $files_path = catdir($base_path, $clean_name); + my @files = glob(catfile($files_path, "*")); - # Get all files for source - my $files_path = $base_path."/".$clean_name; - my @files = `ls $files_path`; - foreach my $file_name (@files) { - $file_name =~ s/\n//; - my $file = $files_path."/".$file_name; + # Process each file + foreach my $input_file (@files) { + # Skip the release file + next if defined($version_file) && $input_file eq $version_file; - # Skip the release file - if (defined($version_file) && $file eq $version_file) {next;} + add_to_log_file($log_file, "Cleaning up file $input_file"); + my ($in_fh, $out_fh); + my $input_file_name = basename($input_file); + my $output_file = $input_file_name; - my ($in_fh, $out_fh); - my $output_file = $file_name; + # Open file normally or with zcat for zipped files + if ($input_file_name =~ /\.(gz|Z)$/x) { + open($in_fh, "zcat $input_file |") or die "Couldn't call 'zcat' to open input file '$input_file' $!"; + $output_file =~ s/\.[^.]+$//; + } else { + open($in_fh, '<', $input_file) or die "Couldn't open file input '$input_file' $!"; + } - # Open file normally or with zcat for zipped filed - if ($file_name =~ /\.(gz|Z)$/x) { - open($in_fh, 
"zcat $file |") - or die "Couldn't call 'zcat' to open input file '$file' $!"; + # Only start cleaning up if could get filehandle + my $count = 0; + my $file_count = 1; + if (defined($in_fh)) { + if ($is_uniprot) { + local $/ = "//\n"; + + my $write_file = catfile($output_path, "$output_file-$file_count"); + open($out_fh, '>', $write_file) or die "Couldn't open output file '$write_file' $!"; + + # Read full records + while (my $record = $in_fh->getline()) { + # Remove unused data + $record =~ s/\nR(N|P|X|A|T|R|L|C|G)\s{3}.*//g; # Remove references lines + $record =~ s/\nCC(\s{3}.*)CAUTION: The sequence shown here is derived from an Ensembl(.*)/\nCT$1CAUTION: The sequence shown here is derived from an Ensembl$2/g; # Set specific caution comment to temporary + $record =~ s/\nCC\s{3}.*//g; # Remove comments + $record =~ s/\nCT(\s{3}.*)CAUTION: The sequence shown here is derived from an Ensembl(.*)/\nCC$1CAUTION: The sequence shown here is derived from an Ensembl$2/g; # Set temp line back to comment + $record =~ s/\nFT\s{3}.*//g; # Remove feature coordinates + $record =~ s/\nDR\s{3}($sources_to_remove);.*//g; # Remove sources skipped at processing + + # Added lines that we do need into output + print $out_fh $record; + + # Check how many lines have been processed and write to new file if size exceeded + $count++; + if ($count > $file_size) { + close($out_fh); + $file_count++; + $write_file = catfile($output_path, "$output_file-$file_count"); + open($out_fh, '>', $write_file) or die "Couldn't open output file '$write_file' $!"; + $count = 0; + } + } - $output_file =~ s/\.[^.]+$//; + close($in_fh); + close($out_fh); } else { - open($in_fh, '<', $file) - or die "Couldn't open file input '$file' $!"; - } - - # Only start cleaning up if could get filehandle - my $count = 0; - my $file_count = 1; - if (defined($in_fh)) { - if ($is_uniprot) { - local $/ = "//\n"; - - my $write_file = $output_path."/".$output_file . "-$file_count"; - open($out_fh, '>', $write_file) or die "Couldn't open output file '$write_file' $!"; - - # Read full records - while ($_ = $in_fh->getline()) { - # Remove unused data - $_ =~ s/\nR(N|P|X|A|T|R|L|C|G)\s{3}.*//g; # Remove references lines - $_ =~ s/\nCC(\s{3}.*)CAUTION: The sequence shown here is derived from an Ensembl(.*)/\nCT$1CAUTION: The sequence shown here is derived from an Ensembl$2/g; # Set specific caution comment to temporary - $_ =~ s/\nCC\s{3}.*//g; # Remove comments - $_ =~ s/\nCT(\s{3}.*)CAUTION: The sequence shown here is derived from an Ensembl(.*)/\nCC$1CAUTION: The sequence shown here is derived from an Ensembl$2/g; # Set temp line back to comment - $_ =~ s/\nFT\s{3}.*//g; # Remove feature coordinates - $_ =~ s/\nDR\s{3}($sources_to_remove);.*//g; # Remove sources skipped at processing - - # Added lines that we do need into output - print $out_fh $_; - - # Check how many lines have been processed and write to new file if size exceeded - $count++; - if ($count > $file_size) { - close($out_fh); - $file_count++; - $write_file = $output_path."/".$output_file . 
"-$file_count"; - open($out_fh, '>', $write_file) - or die "Couldn't open output file '$write_file' $!"; - $count = 0; + $output_file = catfile($output_path, $output_file); + open($out_fh, '>', $output_file) or die "Couldn't open output file '$output_file' $!"; + + # Remove unused data + my $skip_data = 0; + while (my $line = <$in_fh>) { + if ($is_refseq_dna) { + if ($line =~ /^REFERENCE/ || $line =~ /^COMMENT/ || $line =~ /^\s{5}exon/ || $line =~ /^\s{5}misc_feature/ || $line =~ /^\s{5}variation/) { + $skip_data = 1; + } elsif ($line =~ /^\s{5}source/ || $line =~ /^ORIGIN/) { + $skip_data = 0; } - } - - close($in_fh); - close($out_fh); - } else { - $output_file = $output_path."/".$output_file; - open($out_fh, '>', $output_file) or die "Couldn't open output file '$output_file' $!"; - - # Remove unuused data - my $skip_data = 0; - while (<$in_fh>) { - if ($is_refseq_dna) { - if ($_ =~ /^REFERENCE/ || $_ =~ /^COMMENT/ || $_ =~ /^\s{5}exon/ || $_ =~ /^\s{5}misc_feature/ || $_ =~ /^\s{5}variation/) { - $skip_data = 1; - } elsif ($_ =~ /^\s{5}source/ || $_ =~ /^ORIGIN/) { - $skip_data = 0; - } - } elsif ($is_refseq_peptide) { - if ($_ =~ /^REFERENCE/ || $_ =~ /^COMMENT/ || $_ =~ /^\s{5}Protein/) { - $skip_data = 1; - } elsif ($_ =~ /^\s{5}source/ || $_ =~ /^\s{5}CDS/ || $_ =~ /^ORIGIN/) { - $skip_data = 0; - } + } elsif ($is_refseq_peptide) { + if ($line =~ /^REFERENCE/ || $line =~ /^COMMENT/ || $line =~ /^\s{5}Protein/) { + $skip_data = 1; + } elsif ($line =~ /^\s{5}source/ || $line =~ /^\s{5}CDS/ || $line =~ /^ORIGIN/) { + $skip_data = 0; } - - if (!$skip_data) {print $out_fh $_;} } - close($in_fh); - close($out_fh); + print $out_fh $line unless $skip_data; } + + close($in_fh); + close($out_fh); } } - - add_to_log_file($log_file, "Source $source_name cleaned up"); } - # Save the clean files directory in source db - if ($update_clean_uri) { - my ($user, $pass, $host, $port, $source_db) = parse_url($source_db_url); - my $dbi = get_dbi($host, $port, $user, $pass, $source_db); - my $update_version_sth = $dbi->prepare("UPDATE IGNORE version set clean_uri=? where source_id=(SELECT source_id FROM source WHERE name=?)"); - $update_version_sth->execute($output_path, $source_name); - $update_version_sth->finish(); - } + add_to_log_file($log_file, "Source $source_name cleaned up"); +} + +# Save the clean files directory in source db +if ($update_clean_uri) { + my ($host, $port, $user, $pass, $source_db) = parse_url($source_db_url); + my $dbi = get_dbi($host, $port, $user, $pass, $source_db); + my $update_version_sth = $dbi->prepare("UPDATE IGNORE version SET clean_path=? WHERE source_id=(SELECT source_id FROM source WHERE name=?)"); + $update_version_sth->execute($output_path, $source_name); + $update_version_sth->finish(); } sub get_dbi { my ($host, $port, $user, $pass, $dbname) = @_; - my $dbconn; - if (defined $dbname) { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname); - } else { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s", $host, $port); - } - my $dbi = DBI->connect( $dbconn, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr ); + my $dbconn = defined $dbname ? sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname) : sprintf("dbi:mysql:host=%s;port=%s", $host, $port); + my $dbi = DBI->connect($dbconn, $user, $pass, { 'RaiseError' => 1 }) or croak("Can't connect to database: " . 
$DBI::errstr); return $dbi; } sub parse_url { my ($url) = @_; my $parsed_url = Nextflow::Utils::parse($url); - my $user = $parsed_url->{'user'}; - my $pass = $parsed_url->{'pass'}; - my $host = $parsed_url->{'host'}; - my $port = $parsed_url->{'port'}; - my $db = $parsed_url->{'dbname'}; - return ($user, $pass, $host, $port, $db); + return @{$parsed_url}{qw(host port user pass dbname)}; } sub add_to_log_file { @@ -228,7 +212,7 @@ sub add_to_log_file { if (defined($log_file)) { my $current_timestamp = strftime "%d-%b-%Y %H:%M:%S", localtime; - open(my $fh, '>>', $log_file); + open(my $fh, '>>', $log_file) or die "Couldn't open log file '$log_file' $!"; print $fh "$current_timestamp | INFO | $message\n"; close($fh); } diff --git a/src/python/ensembl/common/Params.py b/src/python/ensembl/common/Params.py index b7a163a14..9d1a7b05f 100644 --- a/src/python/ensembl/common/Params.py +++ b/src/python/ensembl/common/Params.py @@ -18,14 +18,14 @@ import re import json import argparse +import os -from typing import Dict, Any +from typing import Dict, Any, Optional, Type sys.tracebacklimit = 0 - class Params: - def __init__(self, params: Dict[str, Any] = None, parse_dataflow_json: bool = True) -> None: + def __init__(self, params: Optional[Dict[str, Any]] = None, parse_dataflow_json: bool = True) -> None: """Params constructor. Parameters @@ -35,13 +35,8 @@ def __init__(self, params: Dict[str, Any] = None, parse_dataflow_json: bool = Tr parse_dataflow_json: bool, optional Specifies whether to parse an option called 'dataflow' in the provided options (default is True) """ - if params is None: - params = {} - - if params: - self._params = params - else: - self._params = {} + self._params = params if params is not None else {} + if not params: self.parse_argv_params(parse_dataflow_json) def parse_argv_params(self, parse_dataflow_json: bool = True) -> None: @@ -56,8 +51,7 @@ def parse_argv_params(self, parse_dataflow_json: bool = True) -> None: args = sys.argv[1:] # Extract param names from command line - r = re.compile(r"^--") - param_names = list(filter(r.match, args)) + param_names = [arg for arg in args if arg.startswith("--")] parser = argparse.ArgumentParser() for name in param_names: @@ -68,160 +62,240 @@ def parse_argv_params(self, parse_dataflow_json: bool = True) -> None: if param_name == "dataflow" and parse_dataflow_json: dataflow_params = json.loads(getattr(params, param_name)) for name, value in dataflow_params.items(): - self.param(name, value) + self.set_param(name, value) else: - self.param(param_name, getattr(params, param_name)) - - def param(self, name: str, new_value: Any = None, options: Dict[str, Any] = None) -> Any: - """Gets or sets a parameter value. + self.set_param(param_name, getattr(params, param_name)) - Parameters - ---------- - name: str - The name of the paramater - new_value: any, optional - The value to set the parameter to (default is None) - options: dict, optional - Extra options, including: - - default: The default value to use if parameter has no value (sets the parameter value to this) - - type: The type of the parameter value, used to check if value is valid - - Returns - ------- - The value of the parameter with provided name. - - Raises - ------ - AttributeError - If no parameter name was passed. 
- """ + def get_param(self, name: str, options: Optional[Dict[str, Any]] = None) -> Any: if not name: raise AttributeError("You must supply a parameter name") if options is None: options = {} - value = None - - if new_value is not None: - self._params[name] = new_value - value = new_value - else: - value = self._params.get(name) - if value is None and options.get("default") is not None: - default = options["default"] - self._params[name] = default - value = default - - if options.get("type"): - return self.check_type(name, value, options["type"]) - - return value - - def param_required(self, name: str, options: Dict[str, Any] = None) -> Any: - """Gets a parameter value, raising an error if no value is found. + value = self._params.get(name) + if value is None: + if "default" in options: + value = options["default"] + elif "required" in options and options["required"]: + raise AttributeError(f"Parameter '{name}' is required but has no value") - Parameters - ---------- - name: str - The name of th parameter - options: dict, optional - Extra options, including: - - default: The default value to use if parameter has no value (sets the parameter value to this) - - type: The type of the parameter value, used to check if value is valid + return self.set_param(name, value, options) - Returns - ------- - The value of the parameter with provided name. + def set_param(self, name: str, value: Any, options: Optional[Dict[str, Any]] = None) -> Any: + if not name: + raise AttributeError("You must supply a parameter name") + if options is None: + options = {} - Raises - ------ - AttributeError - If no value is found for the required paramater. - """ - value = self.param(name, None, options) + if "type" in options: + value = self.check_type(name, value, options["type"]) - if value is None: - raise AttributeError(f"Parameter '{name}' is required but has no value") + self._params[name] = value return value - def check_type(self, name: str, value: Any, value_type: str) -> Any: - """Checks if the parameter value provided is valid. - For specific types, this function can change the parameter value. + # def param(self, name: str, new_value: Any = None, options: Optional[Dict[str, Any]] = None) -> Any: + # """Gets or sets a parameter value. + + # Parameters + # ---------- + # name: str + # The name of the parameter + # new_value: any, optional + # The value to set the parameter to (default is None) + # options: dict, optional + # Extra options, including: + # - default: The default value to use if parameter has no value (sets the parameter value to this) + # - type: The type of the parameter value, used to check if value is valid + + # Returns + # ------- + # The value of the parameter with provided name. + + # Raises + # ------ + # AttributeError + # If no parameter name was passed. + # """ + # if not name: + # raise AttributeError("You must supply a parameter name") + # if options is None: + # options = {} + + # if new_value is not None: + # self._params[name] = new_value + # value = new_value + # else: + # value = self._params.get(name) + # if value is None and "default" in options: + # value = options["default"] + # self._params[name] = value + + # if "type" in options: + # return self.check_type(name, value, options["type"]) + + # return value + + # def param_required(self, name: str, options: Optional[Dict[str, Any]] = None) -> Any: + # """Gets a parameter value, raising an error if no value is found. 
+ + # Parameters + # ---------- + # name: str + # The name of the parameter + # options: dict, optional + # Extra options, including: + # - default: The default value to use if parameter has no value (sets the parameter value to this) + # - type: The type of the parameter value, used to check if value is valid + + # Returns + # ------- + # The value of the parameter with provided name. + + # Raises + # ------ + # AttributeError + # If no value is found for the required parameter. + # """ + # value = self.param(name, None, options) + + # if value is None: + # raise AttributeError(f"Parameter '{name}' is required but has no value") + + # return value + + def check_type(self, name: str, value: Any, value_type: Type) -> Any: + """Checks if the parameter value is of the expected type and attempts conversion if necessary. Parameters ---------- name: str The name of the parameter - value: any + value: Any The value of the parameter - value_type: str - The type of the parameter value. Accepted types: - - hash, dict, or dictionary - - array or list - - int or integer - - bool or boolean - - str or string + value_type: Type + The expected type of the parameter (e.g., `int`, `str`, `bool`) Returns ------- - None if no value is found, or the new value of the parameter with provided name. + The value of the parameter with provided name, converted to the correct type if necessary. Raises ------ AttributeError - If no parameter name is provided. - If parameter value is not valid. + If the parameter name is missing or the value cannot be converted to the specified type. """ if not name: raise AttributeError("You must supply a parameter name") if value is None: return - value_type = value_type.lower() - error, update = False, True - new_value = None - - if value_type in ["hash", "dict", "dictionary"] and not isinstance(value, dict): - error = True - elif value_type in ["array", "list"] and not isinstance(value, list): - # Try to split by commas - if re.search(",", value): - new_value = value.split(",") - else: - new_value = [value] - elif value_type in ["int", "integer"] and not isinstance(value, int): - # Try to make it an integer - try: - new_value = int(value) - except ValueError: - error = True - elif value_type in ["bool", "boolean"] and not isinstance(value, bool): - # Try to make it a boolean + # Special cases first + if value_type is list: + if isinstance(value, str): + # Split the string by commas if present, otherwise wrap it in a list + value = re.sub(r"\s*,\s*", ",", value) + value = value.split(",") if "," in value else [value] + elif not isinstance(value, list): + # If value is not a list and not a string, raise an error + raise AttributeError(f"Parameter '{name}' has an invalid value '{value}'. Expected type list") + elif value_type is bool: if isinstance(value, int): - new_value = bool(value) - elif isinstance(value, str) and value in ["True", "False"]: - new_value = bool(value) - elif value in ["0", "1", 0, 1]: - new_value = bool(int(value)) - else: - error = True - elif value_type in ["str", "string"] and not isinstance(value, str): - new_value = str(value) - else: - update = False - - if error: - raise AttributeError( - f"Parameter '{name}' has an invalid value '{value}'. 
Must be of type {value_type}" - ) - - if update: - self.param(name, new_value) - value = new_value + value = bool(value) + elif isinstance(value, str): + if value in ["True", "False"]: + value = value == "True" + elif value in ["0", "1"]: + value = bool(int(value)) + elif not isinstance(value, bool): + raise AttributeError(f"Parameter '{name}' has an invalid value '{value}'. Expected type bool") + + # General type checking for other types + # if not isinstance(value, value_type): + try: + value = value_type(value) # Attempt conversion + except (ValueError, TypeError): + raise AttributeError(f"Parameter '{name}' has an invalid value '{value}'. Expected type {value_type.__name__}") return value + # def check_type(self, name: str, value: Any, value_type: str) -> Any: + # """Checks if the parameter value provided is valid. + # For specific types, this function can change the parameter value. + + # Parameters + # ---------- + # name: str + # The name of the parameter + # value: any + # The value of the parameter + # value_type: str + # The type of the parameter value. Accepted types: + # - hash, dict, or dictionary + # - array or list + # - int or integer + # - bool or boolean + # - str or string + + # Returns + # ------- + # None if no value is found, or the new value of the parameter with provided name. + + # Raises + # ------ + # AttributeError + # If no parameter name is provided. + # If parameter value is not valid. + # """ + # if not name: + # raise AttributeError("You must supply a parameter name") + # if value is None: + # return + + # value_type = value_type.lower() + # error, update = False, True + # new_value = None + + # if value_type in ["hash", "dict", "dictionary"] and not isinstance(value, dict): + # error = True + # elif value_type in ["array", "list"] and not isinstance(value, list): + # # Try to split by commas + # if isinstance(value, str) and "," in value: + # new_value = value.split(",") + # else: + # new_value = [value] + # elif value_type in ["int", "integer"] and not isinstance(value, int): + # # Try to make it an integer + # try: + # new_value = int(value) + # except ValueError: + # error = True + # elif value_type in ["bool", "boolean"] and not isinstance(value, bool): + # # Try to make it a boolean + # if isinstance(value, int): + # new_value = bool(value) + # elif isinstance(value, str) and value in ["True", "False"]: + # new_value = value == "True" + # elif value in ["0", "1", 0, 1]: + # new_value = bool(int(value)) + # else: + # error = True + # elif value_type in ["str", "string"] and not isinstance(value, str): + # new_value = str(value) + # else: + # update = False + + # if error: + # raise AttributeError( + # f"Parameter '{name}' has an invalid value '{value}'. Must be of type {value_type}" + # ) + + # if update: + # self.param(name, new_value) + # value = new_value + + # return value + def write_output(self, suffix: str, params: Dict[str, Any]) -> None: """Appends data to the dataflow json file (passed into next pipeline process). 
@@ -233,11 +307,19 @@ def write_output(self, suffix: str, params: Dict[str, Any]) -> None: The data to append into the file """ # Remove null params - params = {k: v for k, v in params.items() if v is not None} + output_params = {k: v for k, v in params.items() if v is not None} - with open(f"dataflow_{suffix}.json", "a") as fh: - json.dump(params, fh) - fh.write("\n") + dataflow_file = f"dataflow_{suffix}.json" + dataflow_output_path = self.get_param("dataflow_output_path", {"type": str}) + if dataflow_output_path: + dataflow_file = os.path.join(dataflow_output_path, dataflow_file) + + with open(dataflow_file, "a") as fh: + if output_params: + json.dump(output_params, fh) + fh.write("\n") + else: + fh.write("") def write_all_output(self, suffix: str) -> None: """Appends all of the parameters in the object into the dataflow json file. diff --git a/src/python/ensembl/production/xrefs/Base.py b/src/python/ensembl/production/xrefs/Base.py index 3a59abfc0..04aad4971 100644 --- a/src/python/ensembl/production/xrefs/Base.py +++ b/src/python/ensembl/production/xrefs/Base.py @@ -21,119 +21,72 @@ import fnmatch import gzip import importlib -import wget +import wget # type: ignore import threading -import json import logging -import time import random -import csv -import subprocess -import unicodedata -from sqlalchemy import create_engine, select, insert, update, text, func, and_, delete -from sqlalchemy.engine.url import make_url, URL +from sqlalchemy import create_engine, select, text +from sqlalchemy.dialects.mysql import insert +from sqlalchemy.engine.url import make_url from sqlalchemy.engine import Engine, Connection -from sqlalchemy.orm import aliased from sqlalchemy_utils import database_exists, create_database, drop_database from urllib.parse import urlparse from ftplib import FTP from itertools import groupby from configparser import ConfigParser from datetime import datetime -from pyspark import SparkConf -from pyspark.sql import SparkSession from typing import IO, List, Dict, Any, Iterator, Optional from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper from ensembl.xrefs.xref_source_db_model import ( Base as XrefSourceDB, - Source as SourceSORM, - Version as VersionORM, - ChecksumXref as ChecksumXrefSORM, + Source as SourceSORM ) from ensembl.xrefs.xref_update_db_model import ( Base as XrefUpdateDB, Source as SourceUORM, SourceURL as SourceURLORM, - Xref as XrefUORM, - PrimaryXref as PrimaryXrefORM, - DependentXref as DependentXrefUORM, - CoordinateXref as CoordinateXrefORM, - GeneDirectXref as GeneDirectXrefORM, - TranscriptDirectXref as TranscriptDirectXrefORM, - TranslationDirectXref as TranslationDirectXrefORM, - Synonym as SynonymORM, - Pairs as PairsORM, - Species as SpeciesORM, - MappingJobs as MappingJobsORM, - Mapping as MappingORM, + Species as SpeciesORM ) -from ensembl.core.models import ( - Meta as MetaCORM, - Analysis as AnalysisORM, - AnalysisDescription as AnalysisDescriptionORM, - SeqRegion as SeqRegionORM, - CoordSystem as CoordSystemORM, - Dna as DnaORM, - Gene as GeneORM, - Transcript as TranscriptORM, - Translation as TranslationORM, - Exon as ExonORM, - ExonTranscript as ExonTranscriptORM, - SupportingFeature as SupportingFeatureORM, - DnaAlignFeature as DnaAlignFeatureORM, - AttribType as AttribTypeORM, - TranscriptAttrib as TranscriptAttribORM, - SeqRegionAttrib as SeqRegionAttribORM, - Xref as XrefCORM, - DependentXref as DependentXrefCORM, - ExternalDb as ExternalDbORM, - ObjectXref as ObjectXrefCORM, -) +from ensembl.core.models import Meta as 
MetaCORM from ensembl.common.Params import Params - class Base(Params): """Class to represent the base of xref modules. Inherits the Params class.""" - def __init__(self, params: Dict[str, Any] = None, parse_dataflow_json: bool = True) -> None: - """Calls the parent __init__ then sets some specific parameters. + def __init__(self, params: Optional[Dict[str, Any]] = None, parse_dataflow_json: Optional[bool] = True, testing: bool = False) -> None: + """ + Initialize the Base class with specific parameters. Parameters ---------- - params: dict, optional - The parameters to start the object with. If defined, command-line parameters won't be parsed (default is None) - parse_dataflow_json: bool, optional - Specifies whether to parse an option called 'dataflow' in the provided options (default is True) + params: Optional[Dict[str, Any]] + Initial parameters for the object. If provided, command-line parameters will not be parsed (default is None). + parse_dataflow_json: Optional[bool] + Whether to parse an option called 'dataflow' in the provided options (default is True). """ super().__init__(params, parse_dataflow_json) - self.param( - "metasearch_url", "http://registry-grpc.ebi.ac.uk:8080/registry/metaSearch" - ) + self.set_param("metasearch_url", "http://registry-grpc.ebi.ac.uk:8080/registry/metaSearch") # Initialize the logfile for this run (except for the Alignment module) module_name = self.__class__.__name__ - if module_name != "Alignment": - if self.param("log_timestamp"): - current_timestamp = self.param("log_timestamp") - else: - current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + if module_name != "Alignment" and not testing: + current_timestamp = self.get_param("log_timestamp", {"default": datetime.now().strftime("%Y%m%d_%H%M%S"), "type": str}) log_path = os.path.join( - self.param_required("base_path"), "logs", current_timestamp + self.get_param("base_path", {"required": True}), "logs", current_timestamp ) - if not os.path.exists(log_path): - os.makedirs(log_path, exist_ok=True) + os.makedirs(log_path, exist_ok=True) log_file = os.path.join( log_path, - "tmp_logfile_" + module_name + "_" + str(random.randint(0, 5000)), + f"tmp_logfile_{module_name}_{random.randint(0, 5000)}", ) self._log_file = log_file @@ -155,43 +108,49 @@ def create_source_db(self, source_url: str, reuse_db_if_present: bool) -> None: Parameters ---------- source_url: str - The source database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname] + The source database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname]. reuse_db_if_present: bool - If set to False, the database defined by provided URL will be dropped before creating a new one + If set to True, the existing database will be reused if present. """ url = make_url(source_url) engine = create_engine(url, isolation_level="AUTOCOMMIT") - if url.database and reuse_db_if_present: + if reuse_db_if_present and database_exists(engine.url): + logging.info(f"Database {url.database} already exists and reuse_db_if_present is True. 
Skipping creation.") return if database_exists(engine.url): + logging.info(f"Dropping existing database {url.database}.") drop_database(engine.url) + + logging.info(f"Creating new database {url.database}.") create_database(engine.url) XrefSourceDB.metadata.create_all(engine) + logging.info(f"Database {url.database} created successfully.") def download_file(self, file: str, base_path: str, source_name: str, extra_args: Dict[str, Any]) -> str: - """Downloads an xref file and saves into provided space. + """Downloads an xref file and saves it into the provided space. Parameters ---------- file: str - The URL of the file to download. Acceptable URL schemes: ftp, http, and https + The URL of the file to download. Acceptable URL schemes: ftp, http, and https. base_path: str - The path to save the downloaded file into + The path to save the downloaded file into. source_name: str - The xref source name - extra_args: dict + The xref source name. + extra_args: Dict[str, Any] Extra options, including: - - skip_download_if_file_present: If set to True, file is only downloaded if does not exist - - db: The type of external db for the xref source (only relevent here if equal to 'checksum') - - release: If set to 'version', then this is a version file download - - rel_number: The URL used to retrieve the release number (only for RefSeq) - - catalog: The URL used to retrieve the release catalog (only for RefSeq) + - skip_download_if_file_present: If set to True, file is only downloaded if it does not exist. + - db: The type of external db for the xref source (only relevant here if equal to 'checksum'). + - release: If set to 'version', then this is a version file download. + - rel_number: The URL used to retrieve the release number (only for RefSeq). + - catalog: The URL used to retrieve the release catalog (only for RefSeq). Returns ------- - The path of the downloaded file. + str + The path of the downloaded file. Raises ------ @@ -200,63 +159,46 @@ def download_file(self, file: str, base_path: str, source_name: str, extra_args: AttributeError If file URL scheme is invalid. 
""" - # Create uri object and get scheme uri = urlparse(file) if not uri.scheme: return file - # Get extra parameters - skip_download_if_file_present = ( - extra_args.get("skip_download_if_file_present") or False - ) + skip_download_if_file_present = extra_args.get("skip_download_if_file_present", False) db = extra_args.get("db") release = extra_args.get("release") rel_number = extra_args.get("rel_number") catalog = extra_args.get("catalog") - # Create file download path - orig_source_name = source_name - source_name = re.sub(r"\/", "", source_name) - dest_dir = os.path.join(base_path, source_name) - if db and db == "checksum": - dest_dir = os.path.join(base_path, "Checksum") - if not os.path.exists(dest_dir): - os.makedirs(dest_dir, exist_ok=True) + source_name_clean = re.sub(r"\/", "", source_name) + dest_dir = os.path.join(base_path, "Checksum" if db == "checksum" else source_name_clean) + os.makedirs(dest_dir, exist_ok=True) - file_path = "" + def download_via_http(file_url: str, dest_path: str) -> None: + if not os.path.exists(dest_path) or not skip_download_if_file_present: + if os.path.exists(dest_path): + os.remove(dest_path) + wget.download(file_url, dest_path) + logging.info(f"{source_name} file downloaded via HTTP: {dest_path}") + else: + logging.info(f"{source_name} file already exists, skipping download ({dest_path})") # If file is in local ftp, copy from there if re.search("ftp.ebi.ac.uk", file): - # Construct local path - local_file = file - local_file = re.sub( - "https://ftp.ebi.ac.uk/pub/", "/nfs/ftp/public/", local_file - ) - - # Check if local file exists + local_file = re.sub("https://ftp.ebi.ac.uk/pub/", "/nfs/ftp/public/", file) if os.path.exists(local_file): file_path = os.path.join(dest_dir, os.path.basename(uri.path)) - if db and db == "checksum": - file_path = os.path.join( - dest_dir, f"{source_name}-{os.path.basename(uri.path)}" - ) + if db == "checksum": + file_path = os.path.join(dest_dir, f"{source_name_clean}-{os.path.basename(uri.path)}") if not (skip_download_if_file_present and os.path.exists(file_path)): shutil.copy(local_file, file_path) # Check if copy was successful if os.path.exists(file_path): - logging.info( - f"{orig_source_name} file copied from local FTP: {file_path}" - ) - # if release: - # return file_path - # return os.path.dirname(file_path) + logging.info(f"{source_name} file copied from local FTP: {file_path}") return file_path else: - logging.info( - f"{orig_source_name} file already exists, skipping download ({file_path})" - ) + logging.info(f"{source_name} file already exists, skipping download ({file_path})") # Handle Refseq files if re.search("RefSeq", source_name) and rel_number and catalog and not release: @@ -267,73 +209,31 @@ def download_file(self, file: str, base_path: str, source_name: str, extra_args: # Get list of files in release catalog catalog = re.sub(r"\*", str(release_number), catalog) - files_list = requests.get(catalog).text - refseq_files = files_list.split("\n") + refseq_files = requests.get(catalog).text.split("\n") files_to_download = [] - # Download each refseq file for refseq_file in refseq_files: - if not refseq_file: - continue - checksum, filename = refseq_file.split("\t") - - # Only interested in files matching pattern - if not fnmatch.fnmatch(filename, os.path.basename(uri.path)): - continue - if re.search("nonredundant_protein", filename) or re.search( - "wp_protein", filename - ): - continue - - file_path = os.path.join(dest_dir, os.path.basename(filename)) - if os.path.exists(file_path): - if 
skip_download_if_file_present: - logging.info( - f"{orig_source_name} file already exists, skipping download ({file_path})" - ) - continue - os.remove(file_path) - - file_url = os.path.join(os.path.dirname(file), filename) - files_to_download.append({"url": file_url, "path": file_path}) - logging.info( - f"{orig_source_name} file downloaded via HTTP: {file_path}" - ) + if refseq_file: + checksum, filename = refseq_file.split("\t") + + # Only interested in files matching pattern and not non-redundant or wp_protein + if fnmatch.fnmatch(filename, os.path.basename(uri.path)) and not re.search("nonredundant_protein|wp_protein", filename): + file_path = os.path.join(dest_dir, os.path.basename(filename)) + if os.path.exists(file_path): + if skip_download_if_file_present: + logging.info(f"{source_name} file already exists, skipping download ({file_path})") + continue + os.remove(file_path) + + file_url = os.path.join(os.path.dirname(file), filename) + files_to_download.append({"url": file_url, "path": file_path, "type": source_name}) self.refseq_multithreading(files_to_download) elif uri.scheme == "ftp": - ftp = FTP(uri.netloc) - ftp.login("anonymous", "-anonymous@") - ftp.cwd(os.path.dirname(uri.path)) - remote_files = ftp.nlst() - - # Download files in ftp server - for remote_file in remote_files: - # Only interested in files matching pattern - if not fnmatch.fnmatch(remote_file, os.path.basename(uri.path)): - continue - - remote_file = re.sub(r"\n", "", remote_file) - file_path = os.path.join(dest_dir, os.path.basename(remote_file)) - if db and db == "checksum": - file_path = os.path.join( - dest_dir, f"{source_name}-{os.path.basename(remote_file)}" - ) - - if not (skip_download_if_file_present and os.path.exists(file_path)): - ftp.retrbinary("RETR " + remote_file, open(file_path, "wb").write) - logging.info( - f"{orig_source_name} file downloaded via FTP: {file_path}" - ) - else: - logging.info( - f"{orig_source_name} file already exists, skipping download ({file_path})" - ) - ftp.close() - elif uri.scheme == "http" or uri.scheme == "https": - # This is the case for the release file + file_path = self.download_via_ftp(file, dest_dir, db, source_name, skip_download_if_file_present) + elif uri.scheme in ["http", "https"]: + # This is the case for the RefSeq release file if re.search("RefSeq", source_name) and rel_number and release: - # Get current release number release_number = requests.get(rel_number).json() if not release_number: raise LookupError(f"No release number in {rel_number}") @@ -342,42 +242,51 @@ def download_file(self, file: str, base_path: str, source_name: str, extra_args: uri = urlparse(file) file_path = os.path.join(dest_dir, os.path.basename(uri.path)) - if db and db == "checksum": - file_path = os.path.join( - dest_dir, f"{source_name}-{os.path.basename(uri.path)}" - ) + if db == "checksum": + file_path = os.path.join(dest_dir, f"{source_name_clean}-{os.path.basename(uri.path)}") - if not os.path.exists(file_path) or not skip_download_if_file_present: - if not skip_download_if_file_present and os.path.exists(file_path): - os.remove(file_path) - wget.download(file, file_path) - logging.info( - f"{orig_source_name} file downloaded via HTTP: {file_path}" - ) - else: - logging.info( - f"{orig_source_name} file already exists, skipping download ({file_path})" - ) + download_via_http(file, file_path) else: raise AttributeError(f"Invalid URL scheme {uri.scheme}") - # if release: - # return file_path - # return os.path.dirname(file_path) - if re.search("RefSeq", source_name) and not 
release: - return os.path.dirname(file_path) + return os.path.dirname(file_path) if re.search("RefSeq", source_name) and not release else file_path + + def download_via_ftp(self, ftp_url: str, dest_path: str, db: str, source_name: str, skip_download: bool) -> str: + uri = urlparse(ftp_url) + + ftp = FTP(uri.netloc) + ftp.login("anonymous", "-anonymous@") + ftp.cwd(os.path.dirname(uri.path)) + remote_files = ftp.nlst() + + source_name_clean = re.sub(r"\/", "", source_name) + + for remote_file in remote_files: + # Only interested in files matching pattern + if fnmatch.fnmatch(remote_file, os.path.basename(uri.path)): + file_path = os.path.join(dest_path, os.path.basename(remote_file)) + if db == "checksum": + file_path = os.path.join(dest_path, f"{source_name_clean}-{os.path.basename(remote_file)}") + + if not (skip_download and os.path.exists(file_path)): + ftp.retrbinary("RETR " + remote_file, open(file_path, "wb").write) + logging.info(f"{source_name} file downloaded via FTP: {file_path}") + else: + logging.info(f"{source_name} file already exists, skipping download ({file_path})") + ftp.quit() + return file_path - def refseq_multithreading(self, files: List[str]) -> None: + def refseq_multithreading(self, files: List[Dict[str, str]]) -> None: """Creates multiple threads to download RefSeq files in parallel. Parameters ---------- - files: list + files: List[Dict[str, str]] The list of file URLs and paths to download. """ number_of_threads = 20 - chunk_size = int(len(files) / number_of_threads) + chunk_size = len(files) // number_of_threads threads = [] for thread_index in range(number_of_threads): @@ -397,12 +306,12 @@ def refseq_multithreading(self, files: List[str]) -> None: for thread in threads: thread.join() - def download_refseq_files(self, files: List[str], start: int, end: int) -> None: + def download_refseq_files(self, files: List[Dict[str, str]], start: int, end: int) -> None: """Downloads RefSeq files from a subset of files. Parameters ---------- - files: list + files: List[Dict[str, str]] The list of file URLs and paths to download. start: int The start index of the files list. @@ -415,20 +324,19 @@ def download_refseq_files(self, files: List[str], start: int, end: int) -> None: If file download fails all attempts. """ for index in range(start, end): - failed = 0 file_url = files[index]["url"] local_path = files[index]["path"] + source_name = files[index]["type"] - for retry in range(0, 3): + for attempt in range(3): try: wget.download(file_url, local_path) - except: - failed += 1 - continue - break - - if failed > 0: - raise BufferError(f"Failed to download file {file_url}") + logging.info(f"{source_name} file downloaded via HTTP: {local_path}") + break + except Exception as e: + logging.warning(f"Attempt {attempt + 1} failed to download {file_url}: {e}") + if attempt == 2: + raise Exception(f"Failed to download file {file_url} after 3 attempts") def get_dbi(self, url: str) -> Connection: """Returns a DB connection for a provided URL. @@ -436,117 +344,144 @@ def get_dbi(self, url: str) -> Connection: Parameters ---------- url: str - The database URL to connect to + The database URL to connect to. Returns ------- - An sqlalchemy engine connection. + Connection + An sqlalchemy engine connection. 
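A small sketch of the two database helpers (the connection URL below is hypothetical):

    from sqlalchemy import text
    from ensembl.production.xrefs.Base import Base

    base = Base({"base_path": "/hypothetical/xref/run"}, testing=True)

    url = "mysql://user:pass@mysql-host:3306/xref_source_db"  # hypothetical URL
    engine = base.get_db_engine(url)        # sqlalchemy Engine (AUTOCOMMIT by default)
    with base.get_dbi(url) as dbi:          # convenience wrapper returning an open Connection
        dbi.execute(text("SELECT 1"))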
""" - connect_url = make_url(url) - engine = create_engine(connect_url, isolation_level="AUTOCOMMIT") - + engine = self.get_db_engine(url) return engine.connect() - def get_db_engine(self, url: str) -> Engine: + def get_db_engine(self, url: str, isolation_level: str = "AUTOCOMMIT") -> Engine: """Returns a DB engine for a provided URL. Parameters ---------- url: str - The database URL to create an engine for + The database URL to create an engine for. Returns ------- - An sqlalchemy engine. + Engine + An sqlalchemy engine. """ connect_url = make_url(url) - engine = create_engine(connect_url, isolation_level="AUTOCOMMIT") + engine = create_engine(connect_url, isolation_level=isolation_level) return engine def load_checksum(self, path: str, url: str) -> None: """Loads the xref checksum files into a provided database. - This first combines the checksum data from different xref sources into 1 file called checksum.txt before loading into the DB. + This first combines the checksum data from different xref sources into several chunk files before loading into the DB. + These files are finally combined into one checksum.txt file. Parameters ---------- path: str - The path where the checksum files can be found + The path where the checksum files can be found. url: str - The database URL to load the checksum data into + The database URL to load the checksum data into. + + Raises + ------ + LookupError + If no source_id is found for a source name. """ checksum_dir = os.path.join(path, "Checksum") - if not os.path.exists(checksum_dir): - os.makedirs(checksum_dir, exist_ok=True) + os.makedirs(checksum_dir, exist_ok=True) output_files = [] threshold = 50000000 counter = 1 - source_id = 1 output_fh = None # Connect to db - url = url + "?local_infile=1" + url = f"{url}?local_infile=1" db_engine = self.get_db_engine(url) with db_engine.connect() as dbi: # Get all checksum files - files = os.listdir(checksum_dir) + files = [f for f in os.listdir(checksum_dir) if not re.search("checksum", f)] - # Go through all available checksum files + # Process each checksum file index = 0 for checksum_file in files: - if re.search("checksum", checksum_file): - continue - # Get the source name and ID input_file = os.path.join(checksum_dir, checksum_file) - match = re.search(r"\/([A-Za-z]*)-.*$", input_file) - source_name = match.group(1) - source_id = self.get_source_id_from_name(dbi, source_name) + source_name = re.search(r"\/([A-Za-z]*)-.*$", input_file).group(1) + source_id = self.get_source_id_from_name(source_name, dbi) + + if not source_id: + raise LookupError(f'No source_id found for source name {source_name}') # Open the input file - input_fh = self.get_filehandle(input_file) - for line in input_fh: - # Open the output file - if not output_fh or (counter % threshold) == 0: - if output_fh: - output_fh.close() - index += 1 - output_file = os.path.join( - checksum_dir, f"checksum_{index}.txt" - ) - output_files.append(output_file) - output_fh = open(output_file, "w") - - line = line.rstrip() - (checksum_id, checksum) = re.split(r"\s+", line) - - output = [str(counter), str(source_id), checksum_id, checksum] - output_str = "\t".join(output) - output_fh.write(f"{output_str}\n") - - counter += 1 - - input_fh.close() + with self.get_filehandle(input_file) as input_fh: + for line in input_fh: + # Open the output file if needed + if not output_fh or (counter % threshold) == 0: + if output_fh: + output_fh.close() + + index += 1 + output_file = os.path.join(checksum_dir, f"checksum_{index}.txt") + 
output_files.append(output_file) + output_fh = open(output_file, "w") + + checksum_id, checksum = re.split(r"\s+", line.rstrip()) + output_fh.write(f"{counter}\t{source_id}\t{checksum_id}\t{checksum}\n") + counter += 1 if output_fh: output_fh.close() - # Add the data in the files to the db + # Load data into the database for output_file in output_files: - dbi.execute( - text( - f"load data local infile '{output_file}' into table checksum_xref" + dbi.execute(text(f"LOAD DATA LOCAL INFILE '{output_file}' INTO TABLE checksum_xref")) + + # Merge the created files + if output_files: + merged_file = os.path.join(checksum_dir, "checksum.txt") + with open(merged_file, "w") as output_fh: + for output_file in output_files: + with open(output_file, "r") as input_fh: + shutil.copyfileobj(input_fh, output_fh) + os.remove(output_file) + + def check_file_exists(self, filename: str) -> str: + """Checks if a file exists. + Tries alternative names with .gz and .Z extensions if the original file is not found. + + Parameters + ---------- + filename: str + The file to check. + + Returns + ------- + str + Original file name if found, otherwise the first alternative name found. + + Raises + ------ + FileNotFoundError + If no file name was provided. + If provided file could not be found. + """ + if not filename: + raise FileNotFoundError("No file name provided") + + if not os.path.exists(filename): + alt_filename = re.sub(r"\.(gz|Z)$", "", filename) + if not os.path.exists(alt_filename): + alt_filename = filename + ".gz" + if not os.path.exists(alt_filename): + raise FileNotFoundError( + f"Could not find either {filename} or {alt_filename}" ) - ) + return alt_filename - # Merge the created files - merged_file = os.path.join(checksum_dir, f"checksum.txt") - with open(merged_file, "w") as output_fh: - for output_file in output_files: - with open(output_file, "r") as input_fh: - shutil.copyfileobj(input_fh, output_fh) - os.remove(output_file) + return filename def get_filehandle(self, filename: str) -> IO: """Opens an appropriate read filehandle for a file based on its type. @@ -554,11 +489,12 @@ def get_filehandle(self, filename: str) -> IO: Parameters ---------- filename: str - The name and path of the file to read + The name and path of the file to read. Returns ------- - A read filehandle. + IO + A read filehandle. Raises ------ @@ -566,41 +502,27 @@ def get_filehandle(self, filename: str) -> IO: If no file name was provided. If provided file could not be found. """ - if not filename or filename == "": - raise FileNotFoundError("No file name") - - alt_filename = filename - alt_filename = re.sub(r"\.(gz|Z)$", "", alt_filename) - if alt_filename == filename: - alt_filename = alt_filename + ".gz" - - if not os.path.exists(filename): - if not os.path.exists(alt_filename): - raise FileNotFoundError( - f"Could not find either {filename} or {alt_filename}" - ) - filename = alt_filename + filename = self.check_file_exists(filename) - if re.search(r"\.(gz|Z)$", filename): - fh = gzip.open(filename, "rt") + if filename.endswith(('.gz', '.Z')): + return gzip.open(filename, "rt") else: - fh = open(filename, "r") + return open(filename, "r") - return fh - - def get_source_id_from_name(self, dbi: Connection, source_name: str) -> int: + def get_source_id_from_name(self, source_name: str, dbi: Connection) -> int: """Retrieves a source ID from its name from a database. Parameters ---------- - dbi: db connection - The database connection to query in source_name: str - The name of the source + The name of the source. 
+ dbi: Connection + The database connection to query in. Returns ------- - The source ID. + int + The source ID. """ source_id = dbi.execute( select(SourceSORM.source_id).where(SourceSORM.name == source_name) @@ -608,31 +530,42 @@ def get_source_id_from_name(self, dbi: Connection, source_name: str) -> int: return source_id - def get_file_sections(self, file: str, delimiter: str) -> Iterator[List[str]]: + def get_file_sections(self, filename: str, delimiter: str, encoding: str = None) -> Iterator[List[str]]: """Reads a provided file by sections, separated by a provided delimiter. This function uses 'yield' to provide the file sections one by one. Parameters ---------- file: str - The name and path of the file to read + The name and path of the file to read. delimiter: str - The character or string separating the file sections + The character or string separating the file sections. + encoding: str + The encoding of the file (default is None). Returns ------- - A yield of file sections. + Iterator[List[str]] + A generator yielding file sections as lists of strings. """ - if re.search(r"\.(gz|Z)$", file): - with gzip.open(file, "rt") as fh: - groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) - for key, group in groups: + filename = self.check_file_exists(filename) + + def read_file(fh: IO) -> Iterator[List[str]]: + groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) + for key, group in groups: + if not key: yield list(group) + + if filename.endswith(('.gz', '.Z')): + if encoding: + with gzip.open(filename, "rt", encoding=encoding, errors="replace") as fh: + yield from read_file(fh) + else: + with gzip.open(filename, "rt") as fh: + yield from read_file(fh) else: - with open(file, "r") as fh: - groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) - for key, group in groups: - yield list(group) + with open(filename, "r") as fh: + yield from read_file(fh) def create_xref_db(self, url: str, config_file: str) -> None: """Creates the xref database from model. @@ -641,50 +574,53 @@ def create_xref_db(self, url: str, config_file: str) -> None: Parameters ---------- url: str - The database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname] + The database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname]. config_file: str - The name and path of the .ini file that has information about xref sources and species + The name and path of the .ini file that has information about xref sources and species. """ engine = create_engine(url, isolation_level="AUTOCOMMIT") # Drop database and create again if database_exists(engine.url): + logging.info(f"Dropping existing database {engine.url.database}.") drop_database(engine.url) + logging.info(f"Creating new database {engine.url.database}.") create_database(engine.url) XrefUpdateDB.metadata.create_all(engine) + logging.info(f"Database {engine.url.database} created successfully.") - xref_dbi = engine.connect() - self.populate_xref_db(xref_dbi, config_file) + with engine.connect() as xref_dbi: + self.populate_xref_db(xref_dbi, config_file) + logging.info(f"Database {engine.url.database} populated successfully.") def populate_xref_db(self, dbi: Connection, config_file: str) -> None: """Populates the xref database with configuration data. Parameters ---------- - dbi: db connection - The xref database connection + dbi: Connection + The xref database connection. 
config_file: str - The name and path of the .ini file that has information about xref sources and species to populate the database with + The name and path of the .ini file that has information about xref sources and species to populate the database with. Raises ------ KeyError If a source exists in a species section in the configuration file, but has no source section of its own. """ - source_ids, source_parsers, species_sources = {}, {}, {} - species_sections, sources_sections = {}, {} - config = ConfigParser() config.read(config_file) - for section_name in config.sections(): - section = config[section_name] - (keyword, name) = re.split(r"\s+", section_name) + species_sections = { + name.split(" ", 1)[1]: section for name, section in config.items() if name.startswith("species") + } + sources_sections = { + name.split(" ", 1)[1]: section for name, section in config.items() if name.startswith("source") + } - if keyword == "source": - sources_sections[name] = section - elif keyword == "species": - species_sections[name] = section + species_sources = {} + source_ids = {} + source_parsers = {} # Parse species sections for species_name, section in species_sections.items(): @@ -707,10 +643,8 @@ def populate_xref_db(self, dbi: Connection, config_file: str) -> None: species_sources[species_id] = sources - source_id = 0 # Parse source sections - for source_name, section in sorted(sources_sections.items()): - source_id += 1 + for source_id, (source_name, section) in enumerate(sorted(sources_sections.items()), start=1): source_db_name = section.get("name") order = section.get("order") priority = section.get("priority") @@ -733,15 +667,11 @@ def populate_xref_db(self, dbi: Connection, config_file: str) -> None: source_ids[source_name] = source_id source_parsers[source_id] = parser - # Add source url rows + # Add source_url rows for species_id, sources in species_sources.items(): - source_names = sources.split(",") - - for source_name in source_names: - if not source_ids.get(source_name): - raise KeyError( - f"No source section found for {source_name} in config file" - ) + for source_name in sources.split(","): + if source_name not in source_ids: + raise KeyError(f"No source section found for {source_name} in config file") source_id = source_ids[source_name] parser = source_parsers[source_id] @@ -756,103 +686,98 @@ def get_source_id(self, dbi: Connection, parser: str, species_id: int, name: str Parameters ---------- - dbi: db connection - The database connection to query in + dbi: Connection + The database connection to query in. parser: str - The source parser + The source parser. species_id: int - The ID of the species related to the source + The ID of the species related to the source. name: str - The source name + The source name. division_id: int - The ID of the division related to the source + The ID of the division related to the source. Returns ------- - The source ID. + Optional[int] + The source ID or None if cannot be found. 
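For illustration, a sketch of the section layout that populate_xref_db expects, using the same section-name splitting as above; the .ini keys and values below are hypothetical:

    from configparser import ConfigParser

    # Hypothetical xref_config.ini content; section names are split on the first space
    # into a keyword ("source" or "species") and a name.
    config = ConfigParser()
    config.read_string("""
    [source RefSeq_dna]
    name = RefSeq_dna
    priority = 2
    parser = RefSeqGPFFParser

    [species homo_sapiens]
    taxonomy_id = 9606
    sources = RefSeq_dna
    """)

    species_sections = {name.split(" ", 1)[1]: section for name, section in config.items() if name.startswith("species")}
    sources_sections = {name.split(" ", 1)[1]: section for name, section in config.items() if name.startswith("source")}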
""" - name = "%" + name + "%" + name_pattern = f"%{name}%" source_id = None + # Query by parser, species_id, and name pattern query = select(SourceURLORM.source_id).where( SourceUORM.source_id == SourceURLORM.source_id, SourceURLORM.parser == parser, SourceURLORM.species_id == species_id, + SourceUORM.name.like(name_pattern), ) result = dbi.execute(query) if result.rowcount == 1: - source_id = result.scalar() - - query = ( - select(SourceURLORM.source_id) - .where( - SourceUORM.source_id == SourceURLORM.source_id, - SourceURLORM.parser == parser, - SourceURLORM.species_id == species_id, - ) - .filter(SourceUORM.name.like(name)) + return result.scalar() + + # Query by parser and species_id + query = select(SourceURLORM.source_id).where( + SourceUORM.source_id == SourceURLORM.source_id, + SourceURLORM.parser == parser, + SourceURLORM.species_id == species_id, ) result = dbi.execute(query) if result.rowcount == 1: - source_id = result.scalar() - - if not source_id: - query = ( - select(SourceURLORM.source_id) - .where( - SourceUORM.source_id == SourceURLORM.source_id, - SourceURLORM.parser == parser, - SourceURLORM.species_id == division_id, - ) - .filter(SourceUORM.name.like(name)) - ) - result = dbi.execute(query).first() - if result: - source_id = result[0] + return result.scalar() - return source_id + # Query by parser, division_id, and name pattern + query = select(SourceURLORM.source_id).where( + SourceUORM.source_id == SourceURLORM.source_id, + SourceURLORM.parser == parser, + SourceURLORM.species_id == division_id, + SourceUORM.name.like(name_pattern), + ) + result = dbi.execute(query).scalar() + if result: + return result + + return None def get_taxon_id(self, dbi: Connection) -> int: - """Retrieves the species.taxonomy_id value of the meta table in a database. + """Retrieves the species.taxonomy_id value from the meta table in a database. Parameters ---------- - dbi: db connection - The database connection to query in + dbi: Connection + The database connection to query in. Returns ------- - The taxonomy ID in the database or 1 if not found. + int + The taxonomy ID in the database or 1 if not found. """ result = dbi.execute( - select(MetaCORM.meta_value).where( - MetaCORM.meta_key == "species.taxonomy_id" - ) + select(MetaCORM.meta_value).where(MetaCORM.meta_key == "species.taxonomy_id") ) - if result.rowcount > 0: - return int(result.scalar()) - return 1 + taxon_id = result.scalar() + return int(taxon_id) if taxon_id else 1 def get_division_id(self, dbi: Connection) -> int: - """Retrives the division ID from a database based on the species.division value of the meta table. + """Retrieves the division ID from a database based on the species.division value in the meta table. Parameters ---------- - dbi: db connection - The database connection to query in + dbi: Connection + The database connection to query in. Returns ------- - The division ID in the database or 1 if not found + int + The division ID in the database or 1 if not found. 
""" result = dbi.execute( select(MetaCORM.meta_value).where(MetaCORM.meta_key == "species.division") ) - if result.rowcount > 0: - division = result.scalar() - + division = result.scalar() + if division: division_taxon = { "Ensembl": 7742, "EnsemblVertebrates": 7742, @@ -862,112 +787,105 @@ def get_division_id(self, dbi: Connection) -> int: "Plants": 33090, "EnsemblPlants": 33090, } - - division_id = division_taxon.get(division) - if division_id: - return int(division_id) + return division_taxon.get(division, 1) return 1 - def get_path(self, base_path: str, species: str, release: int, category: str, file_name: str = None) -> str: - """Creates directories based on provided data. + def get_path(self, base_path: str, species: str, release: int, category: str, file_name: Optional[str] = None) -> str: + """Creates directories based on provided data and returns the full file path. Parameters ---------- base_path: str - The base file path + The base file path. species: str - The species name + The species name. release: int - The ensEMBL release number + The Ensembl release number. category: str - The file category - file_name: str, optional - The file name + The file category. + file_name: Optional[str] + The file name. Returns ------- - A file path. + str + The full file path. """ - full_path = os.path.join(base_path, species, release, category) - if not os.path.exists(full_path): - os.makedirs(full_path, exist_ok=True) + full_path = os.path.join(base_path, species, str(release), category) + os.makedirs(full_path, exist_ok=True) - if file_name: - return os.path.join(full_path, file_name) - else: - return full_path + return os.path.join(full_path, file_name) if file_name else full_path def get_db_from_registry(self, species: str, group: str, release: int, registry: str) -> Optional[str]: - """Looks up a db in the registry and returns an sqlaclehmy angine for it. + """Looks up a database in the registry and returns its URL. Parameters ---------- species: str - The species name + The species name. group: str - The db group (core, ccds, otherfeatures, etc...) + The database group (core, ccds, otherfeatures, etc.). release: int - The ensEMBL release number + The Ensembl release number. registry: str - The registry url + The registry URL. Returns ------- - A db engine or 0 if no db is found. + Optional[str] + The database URL or None if no database is found. 
""" - # Fix registry url, if needed - match = re.search(r"^(.*)://(.*)", registry) - if match: - registry = match.group(2) - match = re.search(r"(.*)/(.*)", registry) - if match: - registry = match.group(1) - - metasearch_url = self.param_required("metasearch_url") + # Clean up registry URL if needed + registry = re.sub(r"^(.*://)?(.*?)(/.*)?$", r"\2", registry) + + metasearch_url = self.get_param("metasearch_url", {"required": True}) metasearch_body = { "name_pattern": f"{species}_{group}%", - "filters": [ - {"meta_key": "schema_version", "meta_value": str(release)}, - ], + "filters": [{"meta_key": "schema_version", "meta_value": str(release)}], "servers": [registry], } - dbs = requests.post(metasearch_url, json=metasearch_body).json() - dbs = dbs[registry] + response = requests.post(metasearch_url, json=metasearch_body) + response.raise_for_status() + dbs = response.json().get(registry, []) - if len(dbs) > 0: - db_url = "mysql://" + dbs[0] - return db_url - else: - return None + if dbs: + return f"mysql://{dbs[0]}" + return None - def get_xref_mapper(self, xref_url: str, species: str, base_path: str, release: int, core_url: str = None, registry: str = None) -> BasicMapper: - """Retrives a mapper object based on species. + def get_xref_mapper(self, xref_url: str, species: str, base_path: str, release: int, core_url: Optional[str] = None, registry: Optional[str] = None) -> BasicMapper: + """Retrieves a mapper object based on species. Parameters ---------- xref_url: str - The xref db connection url + The xref db connection URL. species: str - The species name + The species name. base_path: str - The base file path + The base file path. release: int - The ensEMBL release number - core_db: str, optional - The species core db connection url - registry: str, optional - The registry url + The Ensembl release number. + core_url: Optional[str] + The species core db connection URL. + registry: Optional[str] + The registry URL. Returns ------- - A mapper object + BasicMapper + A mapper object. + + Raises + ------ + AttributeError + If neither core_url nor registry is provided. 
""" - # Need either core_db or registry + # Need either core_url or registry if not core_url and not registry: raise AttributeError( - f"Method get_xref_mapper: need to provide either a core DB URL or a registry URL" + "Method get_xref_mapper: need to provide either a core DB URL or a registry URL" ) # Create needed db connections @@ -977,15 +895,13 @@ def get_xref_mapper(self, xref_url: str, species: str, base_path: str, release: core_db = self.get_db_engine(core_url) xref_db = self.get_db_engine(xref_url) - # Extract host and dbname from xref url + # Extract host and dbname from xref URL xref_url_obj = make_url(xref_url) host = xref_url_obj.host dbname = xref_url_obj.database # Locate the fasta files - cdna_path = self.get_path( - base_path, species, release, "ensembl", "transcripts.fa" - ) + cdna_path = self.get_path(base_path, species, release, "ensembl", "transcripts.fa") pep_path = self.get_path(base_path, species, release, "ensembl", "peptides.fa") # Try to find a species-specific mapper first diff --git a/src/python/ensembl/production/xrefs/Checksum.py b/src/python/ensembl/production/xrefs/Checksum.py index 7edf452e0..2d990cf70 100644 --- a/src/python/ensembl/production/xrefs/Checksum.py +++ b/src/python/ensembl/production/xrefs/Checksum.py @@ -14,14 +14,18 @@ """Checksum module for the Xref Download pipeline.""" -from ensembl.production.xrefs.Base import * +import logging +from sqlalchemy import select, func +from ensembl.xrefs.xref_source_db_model import ChecksumXref as ChecksumXrefSORM + +from ensembl.production.xrefs.Base import Base class Checksum(Base): def run(self): - base_path = self.param_required("base_path", {"type": "str"}) - source_db_url = self.param_required("source_db_url", {"type": "str"}) - skip_download = self.param_required("skip_download", {"type": "bool"}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) + skip_download: bool = self.get_param("skip_download", {"required": True, "type": bool}) logging.info("Checksum starting with parameters:") logging.info(f"Param: base_path = {base_path}") @@ -32,15 +36,17 @@ def run(self): db_engine = self.get_db_engine(source_db_url) # Check if checksums already exist - table_nonempty = 0 - if skip_download: - with db_engine.connect() as dbi: - query = select(func.count(ChecksumXrefSORM.checksum_xref_id)) - table_nonempty = dbi.execute(query).scalar() + table_empty = self.check_table_empty(db_engine) if skip_download else True # Load checksums from files into db - if not table_nonempty: + if table_empty: self.load_checksum(base_path, source_db_url) logging.info("Checksum data loaded") else: logging.info("Checksum data already exists, skipping loading") + + def check_table_empty(self, db_engine): + """Check if the checksum table is empty.""" + with db_engine.connect() as dbi: + query = select(func.count(ChecksumXrefSORM.checksum_xref_id)) + return dbi.execute(query).scalar() == 0 diff --git a/src/python/ensembl/production/xrefs/DownloadSource.py b/src/python/ensembl/production/xrefs/DownloadSource.py index f3b9f20f4..b57407938 100644 --- a/src/python/ensembl/production/xrefs/DownloadSource.py +++ b/src/python/ensembl/production/xrefs/DownloadSource.py @@ -14,39 +14,48 @@ """Download module to download xref and version files.""" -from ensembl.production.xrefs.Base import * +import logging +from sqlalchemy import select +from sqlalchemy.dialects.mysql import insert +from typing import Optional +from 
ensembl.xrefs.xref_source_db_model import ( + Source as SourceSORM, + Version as VersionORM, +) + +from ensembl.production.xrefs.Base import Base class DownloadSource(Base): def run(self): - base_path = self.param_required("base_path", {"type": "str"}) - parser = self.param_required("parser", {"type": "str"}) - name = self.param_required("name", {"type": "str"}) - priority = self.param_required("priority", {"type": "int"}) - source_db_url = self.param_required("source_db_url", {"type": "str"}) - file = self.param_required("file", {"type": "str"}) - skip_download = self.param_required("skip_download", {"type": "bool"}) - db = self.param("db", None, {"type": "str"}) - version_file = self.param("version_file", None, {"type": "str"}) - rel_number = self.param("rel_number", None, {"type": "str"}) - catalog = self.param("catalog", None, {"type": "str"}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + parser: str = self.get_param("parser", {"required": True, "type": str}) + name: str = self.get_param("name", {"required": True, "type": str}) + priority: int = self.get_param("priority", {"required": True, "type": int}) + source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) + file: str = self.get_param("file", {"required": True, "type": str}) + skip_download: bool = self.get_param("skip_download", {"required": True, "type": bool}) + db: Optional[str] = self.get_param("db", {"type": str}) + version_file: Optional[str] = self.get_param("version_file", {"type": str}) + rel_number: Optional[str] = self.get_param("rel_number", {"type": str}) + catalog: Optional[str] = self.get_param("catalog", {"type": str}) logging.info(f"DownloadSource starting for source {name}") # Download the main xref file - extra_args = {} - extra_args["skip_download_if_file_present"] = skip_download - extra_args["db"] = db + extra_args = { + "skip_download_if_file_present": skip_download, + "db": db + } if rel_number and catalog: - extra_args["rel_number"] = rel_number - extra_args["catalog"] = catalog - file_name = self.download_file(file, base_path, name, extra_args) + extra_args.update({"rel_number": rel_number, "catalog": catalog}) + file_path = self.download_file(file, base_path, name, extra_args) # Download the version file - version = "" + version_path = None if version_file: extra_args["release"] = "version" - version = self.download_file(version_file, base_path, name, extra_args) + version_path = self.download_file(version_file, base_path, name, extra_args) # Update source db db_engine = self.get_db_engine(source_db_url) @@ -54,20 +63,20 @@ def run(self): dbi.execute( insert(SourceSORM) .values(name=name, parser=parser) - .prefix_with("IGNORE") + .on_duplicate_key_update(parser=parser) ) - source_id = dbi.execute( select(SourceSORM.source_id).where(SourceSORM.name == name) ).scalar() + dbi.execute( insert(VersionORM) .values( source_id=source_id, - file_path=file_name, + file_path=file_path, db=db, priority=priority, - revision=version, + revision=version_path, ) - .prefix_with("IGNORE") + .on_duplicate_key_update(revision=version_path) ) diff --git a/src/python/ensembl/production/xrefs/EmailNotification.py b/src/python/ensembl/production/xrefs/EmailNotification.py index 4295041a0..f574f2dc3 100644 --- a/src/python/ensembl/production/xrefs/EmailNotification.py +++ b/src/python/ensembl/production/xrefs/EmailNotification.py @@ -14,238 +14,52 @@ """Email module to send user emails notifying of xref pipelines end, with important information and statistics.""" 
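# Sketch of the MySQL "insert or update" pattern adopted in DownloadSource above,
# using SQLAlchemy's dialect-specific insert. SourceSORM is the Source model from
# ensembl.xrefs.xref_source_db_model as imported in the patch; a unique key on
# `name` is assumed for the upsert to behave as intended.
from sqlalchemy import select
from sqlalchemy.dialects.mysql import insert
from sqlalchemy.engine import Connection
from ensembl.xrefs.xref_source_db_model import Source as SourceSORM

def upsert_source(dbi: Connection, name: str, parser: str) -> int:
    # Insert the source row, or refresh its parser if the name already exists.
    stmt = insert(SourceSORM).values(name=name, parser=parser).on_duplicate_key_update(parser=parser)
    dbi.execute(stmt)
    # Fetch the id of the row that now exists for this source name.
    return dbi.execute(select(SourceSORM.source_id).where(SourceSORM.name == name)).scalar()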
-from ensembl.production.xrefs.Base import * - +import os +import re from smtplib import SMTP from email.message import EmailMessage +from typing import Dict, Any, Tuple +from ensembl.production.xrefs.Base import Base class EmailNotification(Base): + INDENT = "   " + def run(self): - pipeline_name = self.param_required("pipeline_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - email_address = self.param_required("email", {"type": "str"}) - email_server = self.param_required("email_server", {"type": "str"}) - log_timestamp = self.param("log_timestamp", None, {"type": "str"}) + pipeline_name: str = self.get_param("pipeline_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + email_address: str = self.get_param("email", {"required": True, "type": str}) + email_server: str = self.get_param("email_server", {"required": True, "type": str}) + log_timestamp: str = self.get_param("log_timestamp", {"type": str}) email_message = f"The {pipeline_name} has completed its run.
" - indent = "   " - if log_timestamp: # Get the path of the log files log_path = os.path.join(base_path, "logs", log_timestamp) - # Read the log file if os.path.exists(log_path): - parameters = {} + # Combine the logs into a single file + main_log_file = self.combine_logs(base_path, log_timestamp, pipeline_name) - # Copy different log files into a main one - main_log_file = self.combine_logs( - base_path, log_timestamp, pipeline_name - ) - - # Read the full logs + # Read the logs with open(main_log_file) as fh: data = fh.read() - # Extract parameter data - parameters_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Param: (\w+) = (.*)", data - ) - parameters = {param[0]: param[1] for param in parameters_list} - - email_message += ( - "
The pipeline was run with the following parameters:
" - ) - for param_name, param_value in parameters.items(): - if param_value == "1" or param_value == "0": - param_value = bool(param_value) - email_message += f"{param_name} = {param_value}
" + # Extract the parameters and format them + parameters = self.extract_parameters(data) + email_message += self.format_parameters(parameters) # Extract statistics data from logs if re.search("Download", pipeline_name): - sources_data, added_species, skipped_species = {}, {}, {} - - # Get sources scheduled for download - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to download: ([\w\/]+)", - data, - ) - sources_data = { - source: {"to_download": 1} for source in matches_list - } - - # Get sources scheduled for cleanup - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to cleanup: ([\w\/]+)", - data, - ) - for source in matches_list: - sources_data[source].update({"to_cleanup": 1}) - - # Get sources cleaned up - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source ([\w\/]+) cleaned up", - data, - ) - for source in matches_list: - sources_data[source].update({"cleaned_up": 1}) - - # Get sources with skipped download - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file already exists, skipping download \((.*)\)", - data, - ) - for source in matches_list: - sources_data[source[0]].update( - {"skipped": os.path.dirname(source[1])} - ) - - # Get sources downloaded - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file downloaded via (HTTP|FTP): (.*)", - data, - ) - for source in matches_list: - sources_data[source[0]].update( - {"downloaded": source[1] + "|" + os.path.dirname(source[2])} - ) - - # Get sources copied from local ftp - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file copied from local FTP: (.*)", - data, - ) - for source in matches_list: - sources_data[source[0]].update( - {"copied": os.path.dirname(source[1])} - ) - - # Get skipped species - skipped_species_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) skipped species = (\d+)", - data, - ) - skipped_species = { - source[0]: source[1] for source in skipped_species_list - } - - # Get species with files created - added_species_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) species files created = (\d+)", - data, - ) - added_species = { - source[0]: source[1] for source in added_species_list - } - - # Add source statistics to email message - email_message += "
 --Source Statistics--<br>" for source_name, source_values in sources.items(): - email_message += f"{source_name}:<br>" - if source_values.get("to_download"): - email_message += f"{indent}Scheduled for download ✔<br>" - - if source_values.get("downloaded"): - (download_type, file_path) = source_values[ - "downloaded" - ].split("|") - email_message += f"{indent}File downloaded via {download_type} into {file_path}<br>" - elif source_values.get("copied"): - email_message += ( - indent - + "File(s) copied from local FTP into %s<br>" - % (source_values["copied"]) - ) - elif source_values.get("skipped"): - email_message += ( - indent - + "File(s) download skipped, already exists in %s<br>" - % (source_values["skipped"]) - ) - - if source_values.get("to_cleanup"): - email_message += f"{indent}Scheduled for cleanup ✔<br>" - if source_values.get("cleaned_up"): - email_message += f"{indent}Cleaned up ✔<br>" - - # Add species statistics to email message - email_message += "<br> --Species Statistics--<br>" - email_message += "Skipped Species (files already exist):<br>" - for source_name, count in skipped_species.items(): - email_message += f"{indent}{source_name}: {count}<br>" - email_message += "Added Species (files created):<br>" - for source_name, count in added_species.items(): - email_message += f"{indent}{source_name}: {count}<br>" - - email_message += "
To run the Xref Process Pipeline based on the data from this pipeline, use the same --source_db_url, and --config_file values provided to this pipeline." + sources_data, added_species, skipped_species = self.extract_download_statistics(data) + email_message += self.format_download_statistics(sources_data, added_species, skipped_species) elif re.search("Process", pipeline_name): - parsed_sources, species_counts = {}, {} - - # Get species mapped - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Mapping starting for species '([\w\/]+)'", - data, - ) - for species_name in matches_list: - species_counts[species_name] = { - "DIRECT": 0, - "INFERRED_PAIR": 0, - "MISC": 0, - "CHECKSUM": 0, - "DEPENDENT": 0, - "SEQUENCE_MATCH": 0, - } - - # Get number of xrefs added per species per source - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| \tLoaded (\d+) ([\w\/]+) xrefs for '([\w\/]+)'", - data, - ) - for species in matches_list: - count = int(species[0]) - xref_type = species[1] - species_name = species[2] - - prev_count = species_counts[species_name][xref_type] - count += prev_count - - species_counts[species_name][xref_type] = count - - # Get parsed sources per species - matches_list = re.findall( - r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ParseSource starting for source '([\w\/]+)' with parser '([\w\/]+)' for species '([\w\/]+)'", - data, - ) - for species in matches_list: - source_name = species[0] - parser = species[1] - species_name = species[2] - - parsed_sources[species_name].update({source_name: parser}) - - # Add species statistics to email message - email_message += "
 --Species Statistics--<br>" - for species_name, species_data in parsed_sources.items(): - email_message += f"{species_name}:
" - email_message += f"{indent}Sources parsed: " + ",".join(keys(species_data)) - - xref_counts = species_counts[species_name] - email_message += indent + "Xrefs added: " - for xref_type, count in xref_counts.items(): - email_message += f"{count} {xref_type} " + parsed_sources, species_counts = self.extract_process_statistics(data) + email_message += self.format_process_statistics(parsed_sources, species_counts) # Send email - message = EmailMessage() - message["Subject"] = f"{pipeline_name} Finished" - message["From"] = email_address - message["To"] = email_address - message.set_content(email_message, "html") - - smtp = SMTP(email_server) - smtp.send_message(message) + self.send_email(email_address, email_server, pipeline_name, email_message) def combine_logs(self, base_path: str, timestamp: str, type: str) -> str: ordered_processes = { @@ -277,18 +91,12 @@ def combine_logs(self, base_path: str, timestamp: str, type: str) -> str: "EmailNotification", ], } - log_order = ( - ordered_processes["download"] - if re.search("Download", type) - else ordered_processes["process"] - ) + log_order = ordered_processes["download"] if re.search("Download", type) else ordered_processes["process"] log_path = os.path.join(base_path, "logs", timestamp) log_files = os.listdir(log_path) - main_log_file = os.path.join( - base_path, "logs", timestamp, "logfile_" + timestamp - ) + main_log_file = os.path.join(base_path, "logs", timestamp, "logfile_" + timestamp) # Copy different log files into a main one with open(main_log_file, "a") as out_fh: @@ -297,10 +105,159 @@ def combine_logs(self, base_path: str, timestamp: str, type: str) -> str: matches = [s for s in log_files if re.search(pattern, s)] for log_file in matches: - log_file = os.path.join(log_path, log_file) - with open(log_file) as in_fh: - log_data = in_fh.read() - out_fh.write(log_data) - os.remove(log_file) + log_file_path = os.path.join(log_path, log_file) + with open(log_file_path) as in_fh: + out_fh.write(in_fh.read()) + os.remove(log_file_path) return main_log_file + + def extract_parameters(self, data: str) -> Dict[str, str]: + parameters_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Param: (\w+) = (.*)", data) + return {param[0]: param[1] for param in parameters_list} + + def format_parameters(self, parameters: Dict[str, str]) -> str: + message = "
 The pipeline was run with the following parameters:<br>" + for param_name, param_value in parameters.items(): + message += f"{param_name} = {param_value}
" + + return message + + def extract_download_statistics(self, data: str) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, str], Dict[str, str]]: + sources_data = self.extract_sources_data(data) + skipped_species = self.extract_skipped_species(data) + added_species = self.extract_added_species(data) + + return sources_data, added_species, skipped_species + + def extract_sources_data(self, data: str) -> Dict[str, Dict[str, Any]]: + sources_data = {} + + sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to download: ([\w\/]+)", "to_download")) + sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to cleanup: ([\w\/]+)", "to_cleanup")) + sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source ([\w\/]+) cleaned up", "cleaned_up")) + sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file already exists, skipping download \((.*)\)", "skipped", True)) + sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file downloaded via (HTTP|FTP): (.*)", "downloaded", True)) + sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file copied from local FTP: (.*)", "copied", True)) + + return sources_data + + def extract_sources(self, data: str, pattern: str, key: str, split: bool = False) -> Dict[str, Dict[str, Any]]: + sources = {} + + matches_list = re.findall(pattern, data) + for match in matches_list: + if split: + if key == "skipped" or key == "copied": + val = os.path.dirname(match[1]) + else: + val = f"{match[1]}|" + os.path.dirname(match[2]) + sources[match[0]] = {key: val} + else: + sources[match] = {key: True} + + return sources + + def extract_skipped_species(self, data: str) -> Dict[str, str]: + skipped_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) skipped species = (\d+)", data) + return {species[0]: species[1] for species in skipped_species_list} + + def extract_added_species(self, data: str) -> Dict[str, str]: + added_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) species files created = (\d+)", data) + return {species[0]: species[1] for species in added_species_list} + + def format_download_statistics(self, sources_data: Dict[str, Dict[str, Any]], added_species: Dict[str, str], skipped_species: Dict[str, str]) -> str: + message = "
 --Source Statistics--<br>" + + for source_name, source_values in sources_data.items(): + message += f"{source_name}:<br>" + if source_values.get("to_download"): + message += f"{self.INDENT}Scheduled for download ✔<br>" + if source_values.get("downloaded"): + download_type, file_path = source_values["downloaded"].split("|") + message += f"{self.INDENT}File downloaded via {download_type} into {file_path}<br>" + elif source_values.get("copied"): + message += f"{self.INDENT}File(s) copied from local FTP into {source_values['copied']}<br>" + elif source_values.get("skipped"): + message += f"{self.INDENT}File(s) download skipped, already exists in {source_values['skipped']}<br>" + if source_values.get("to_cleanup"): + message += f"{self.INDENT}Scheduled for cleanup ✔<br>" + if source_values.get("cleaned_up"): + message += f"{self.INDENT}Cleaned up ✔<br>" + + message += "<br> --Species Statistics--<br>" + message += "Skipped Species (files already exist):<br>" + for source_name, count in skipped_species.items(): + message += f"{self.INDENT}{source_name}: {count}<br>" + message += "Added Species (files created):<br>" + for source_name, count in added_species.items(): + message += f"{self.INDENT}{source_name}: {count}<br>" + + message += "
To run the Xref Process Pipeline based on the data from this pipeline, use the same --source_db_url, --split_files_by_species, and --config_file values provided to this pipeline." + return message + + def extract_process_statistics(self, data: str) -> Tuple[Dict[str, Dict[str, str]], Dict[str, Dict[str, int]]]: + parsed_sources = self.extract_parsed_sources(data) + species_counts = self.extract_species_counts(data) + + return parsed_sources, species_counts + + def extract_parsed_sources(self, data: str) -> Dict[str, Dict[str, str]]: + parsed_sources = {} + + matches_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ParseSource starting for source '([\w\/]+)' with parser '([\w\/]+)' for species '([\w\/]+)'", data) + for species in matches_list: + source_name, parser, species_name = species + if species_name not in parsed_sources: + parsed_sources[species_name] = {} + parsed_sources[species_name][source_name] = parser + + return parsed_sources + + def extract_species_counts(self, data: str) -> Dict[str, Dict[str, int]]: + species_counts = {} + + # Get species mapped + matches_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Mapping starting for species '([\w\/]+)'", data) + for species_name in matches_list: + species_counts[species_name] = { + "DIRECT": 0, + "INFERRED_PAIR": 0, + "MISC": 0, + "CHECKSUM": 0, + "DEPENDENT": 0, + "SEQUENCE_MATCH": 0, + } + + # Get number of xrefs added per species per source + matches_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| \tLoaded (\d+) ([\w\/]+) xrefs for '([\w\/]+)'", data) + for species in matches_list: + count, xref_type, species_name = int(species[0]), species[1], species[2] + species_counts[species_name][xref_type] += count + + return species_counts + + def format_process_statistics(self, parsed_sources: Dict[str, Dict[str, str]], species_counts: Dict[str, Dict[str, int]]) -> str: + message = "
 --Species Statistics--<br>" + + for species_name, species_data in parsed_sources.items(): + message += f"{species_name}:<br>" + message += f"{self.INDENT}Sources parsed: " + ",".join(species_data.keys()) + "<br>" + + xref_counts = species_counts[species_name] + message += f"{self.INDENT}Xrefs added: " + for xref_type, count in xref_counts.items(): + message += f"{count} {xref_type} " + message += "
" + + return message + + def send_email(self, email_address: str, email_server: str, pipeline_name: str, email_message: str) -> None: + message = EmailMessage() + message["Subject"] = f"{pipeline_name} Finished" + message["From"] = email_address + message["To"] = email_address + message.set_content(email_message, "html") + + with SMTP(email_server) as smtp: + smtp.send_message(message) diff --git a/src/python/ensembl/production/xrefs/ScheduleCleanup.py b/src/python/ensembl/production/xrefs/ScheduleCleanup.py index eeddf94e1..19388b9fb 100644 --- a/src/python/ensembl/production/xrefs/ScheduleCleanup.py +++ b/src/python/ensembl/production/xrefs/ScheduleCleanup.py @@ -14,16 +14,26 @@ """Scheduling module to create cleanup jobs for specific xref sources.""" -from ensembl.production.xrefs.Base import * +import logging +import os +import re +from typing import Optional +from sqlalchemy import select +from ensembl.xrefs.xref_source_db_model import ( + Source as SourceSORM, + Version as VersionORM, +) + +from ensembl.production.xrefs.Base import Base class ScheduleCleanup(Base): def run(self): - base_path = self.param_required("base_path", {"type": "str"}) - source_db_url = self.param_required("source_db_url", {"type": "str"}) - clean_files = self.param("clean_files", None, {"type": "bool"}) - clean_dir = self.param("clean_dir", None, {"type": "str"}) - split_files_by_species = self.param("split_files_by_species", None, {"type": "bool"}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) + clean_files: Optional[bool] = self.get_param("clean_files", {"type": bool}) + clean_dir: Optional[str] = self.get_param("clean_dir", {"type": str}) + split_files_by_species: Optional[bool] = self.get_param("split_files_by_species", {"type": bool}) logging.info("ScheduleCleanup starting with parameters:") logging.info(f"Param: base_path = {base_path}") @@ -41,6 +51,7 @@ def run(self): ) sources = dbi.execute(query).mappings().all() + cleanup_sources = 0 for source in sources: # Only cleaning RefSeq and UniProt for now if not ( @@ -50,14 +61,18 @@ def run(self): continue # Remove / char from source name to access directory - clean_name = source.name - clean_name = re.sub(r"\/", "", clean_name) + clean_name = re.sub(r"\/", "", source.name) # Send parameters into cleanup jobs for each source - if os.path.exists(os.path.join(base_path, clean_name)): + source_path = os.path.join(base_path, clean_name) + if os.path.exists(source_path): + cleanup_sources += 1 logging.info(f"Source to cleanup: {source.name}") self.write_output( "cleanup_sources", {"name": source.name, "version_file": source.revision}, ) + + if cleanup_sources == 0: + self.write_output("cleanup_sources", {}) diff --git a/src/python/ensembl/production/xrefs/ScheduleDownload.py b/src/python/ensembl/production/xrefs/ScheduleDownload.py index f9af93454..10b2a32af 100644 --- a/src/python/ensembl/production/xrefs/ScheduleDownload.py +++ b/src/python/ensembl/production/xrefs/ScheduleDownload.py @@ -14,14 +14,16 @@ """Scheduling module to create download jobs for all xref sources in config file.""" -from ensembl.production.xrefs.Base import * +import json +import logging +from ensembl.production.xrefs.Base import Base class ScheduleDownload(Base): - def run(self): - config_file = self.param_required("config_file", {"type": "str"}) - source_db_url = self.param_required("source_db_url", {"type": "str"}) - reuse_db = self.param_required("reuse_db", 
{"type": "bool"}) + def run(self) -> None: + config_file: str = self.get_param("config_file", {"required": True, "type": str}) + source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) + reuse_db: bool = self.get_param("reuse_db", {"required": True, "type": bool}) logging.info("ScheduleDownload starting with parameters:") logging.info(f"Param: config_file = {config_file}") @@ -32,12 +34,11 @@ def run(self): self.create_source_db(source_db_url, reuse_db) # Extract sources to download from config file - sources = [] with open(config_file) as conf_file: sources = json.load(conf_file) - if len(sources) < 1: - raise IOError( + if not sources: + raise ValueError( f"No sources found in config file {config_file}. Need sources to run pipeline" ) diff --git a/src/python/ensembl/production/xrefs/config/gencode_sources.json b/src/python/ensembl/production/xrefs/config/gencode_sources.json new file mode 100644 index 000000000..d9b0e2fa5 --- /dev/null +++ b/src/python/ensembl/production/xrefs/config/gencode_sources.json @@ -0,0 +1,204 @@ +[ + { + "name" : "ArrayExpress", + "parser" : "ArrayExpressParser", + "file" : "Database", + "db" : "core", + "priority" : 1 + }, + { + "name" : "CCDS", + "parser" : "CCDSParser", + "file" : "Database", + "db" : "ccds", + "priority" : 1 + }, + { + "name" : "UniParc", + "parser" : "ChecksumParser", + "file" : "https://ftp.ebi.ac.uk/pub/contrib/uniparc/upidump.lis.gz", + "db" : "checksum", + "priority" : 1 + }, + { + "name" : "RNACentral", + "parser" : "ChecksumParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/md5/md5.tsv.gz", + "db" : "checksum", + "priority" : 1 + }, + { + "name" : "DBASS3", + "parser" : "DBASSParser", + "file" : "https://www.dbass.soton.ac.uk/Dbass3/DownloadCsv", + "priority" : 1 + }, + { + "name" : "DBASS5", + "parser" : "DBASSParser", + "file" : "https://www.dbass.soton.ac.uk/Dbass5/DownloadCsv", + "priority" : 1 + }, + { + "name" : "EntrezGene", + "parser" : "EntrezGeneParser", + "file" : "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz", + "priority" : 1 + }, + { + "name" : "HPA", + "parser" : "HPAParser", + "file" : "https://www.proteinatlas.org/download/xref.php", + "priority" : 1 + }, + { + "name" : "MGI", + "parser" : "MGIParser", + "file" : "https://www.informatics.jax.org/downloads/reports/MRK_ENSEMBL.rpt", + "priority" : 2 + }, + { + "name" : "MGI_desc", + "parser" : "MGIDescParser", + "file" : "https://www.informatics.jax.org/downloads/reports/MRK_List2.rpt", + "priority" : 1 + }, + { + "name" : "MIM2GENE", + "parser" : "Mim2GeneParser", + "file" : "https://ftp.ncbi.nlm.nih.gov/gene/DATA/mim2gene_medgen", + "priority" : 3 + }, + { + "name" : "MIM", + "parser" : "MIMParser", + "file" : "https://data.omim.org/downloads/ZpPlmgwjuTBK9T5vf2sFjA/omim.txt.gz", + "priority" : 2 + }, + { + "name" : "RFAM", + "parser" : "RFAMParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.seed.gz", + "db" : "core", + "priority" : 1 + }, + { + "name" : "Reactome", + "parser" : "ReactomeParser", + "file" : "https://www.reactome.org/download/current/Ensembl2Reactome_All_Levels.txt", + "release" : "https://www.reactome.org/ReactomeRESTfulAPI/RESTfulWS/version", + "priority" : 1 + }, + { + "name" : "Reactome", + "parser" : "ReactomeParser", + "file" : "https://www.reactome.org/download/current/UniProt2Reactome_All_Levels.txt", + "release" : "https://www.reactome.org/ReactomeRESTfulAPI/RESTfulWS/version", + "priority" : 2 + }, + { + "name" : "RefSeq_dna", + "parser" : 
"RefSeqParser", + "file" : "https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/*rna.gbff.gz", + "method" : "--bestn 5", + "query_cutoff" : 90, + "target_cutoff" : 90, + "release" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-notes/RefSeq-release*.txt", + "priority" : 2, + "release_number" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER", + "catalog" : "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.files.installed" + }, + { + "name" : "RefSeq_dna", + "parser" : "RefSeqParser", + "file" : "https://ftp.ncbi.nih.gov/refseq/M_musculus/mRNA_Prot/*rna.gbff.gz", + "method" : "--bestn 5", + "query_cutoff" : 90, + "target_cutoff" : 90, + "release" : "https://ftp.ncbi.nih.gov/refseq/release/release-notes/RefSeq-release*.txt", + "priority" : 2, + "release_number" : "https://ftp.ncbi.nih.gov/refseq/release/RELEASE_NUMBER", + "catalog" : "https://ftp.ncbi.nlm.nih.gov/refseq/M_musculus/mRNA_Prot/mouse.files.installed" + }, + { + "name" : "RefSeq_peptide", + "parser" : "RefSeqParser", + "file" : "https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/*protein.gpff.gz", + "method" : "--bestn 1", + "query_cutoff" : 100, + "target_cutoff" : 100, + "release" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-notes/RefSeq-release*.txt", + "priority" : 3, + "release_number" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER", + "catalog" : "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/mRNA_Prot/human.files.installed" + }, + { + "name" : "RefSeq_peptide", + "parser" : "RefSeqParser", + "file" : "https://ftp.ncbi.nih.gov/refseq/M_musculus/mRNA_Prot/*protein.gpff.gz", + "method" : "--bestn 1", + "query_cutoff" : 100, + "target_cutoff" : 100, + "release" : "https://ftp.ncbi.nih.gov/refseq/release/release-notes/RefSeq-release*.txt", + "priority" : 3, + "release_number" : "https://ftp.ncbi.nih.gov/refseq/release/RELEASE_NUMBER", + "catalog" : "https://ftp.ncbi.nlm.nih.gov/refseq/M_musculus/mRNA_Prot/mouse.files.installed" + }, + { + "name" : "Refseq_import", + "parser" : "RefSeqCoordinateParser", + "file" : "Database", + "db" : "otherfeatures", + "priority" : 2 + }, + { + "name" : "UCSC_hg38", + "parser" : "UCSCParser", + "file" : "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/knownGene.txt.gz", + "release" : "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/README.txt", + "priority" : 1 + }, + { + "name" : "UCSC_mm10", + "parser" : "UCSCParser", + "file" : "https://hgdownload.cse.ucsc.edu/goldenPath/mm10/database/knownGene.txt.gz", + "release" : "https://hgdownload.cse.ucsc.edu/goldenPath/mm10/database/README.txt", + "priority" : 1 + }, + { + "name" : "Uniprot/SWISSPROT", + "parser" : "UniProtParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_sprot.dat.gz", + "method" : "--bestn 1", + "query_cutoff" : 100, + "target_cutoff" : 100, + "release" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/reldate.txt", + "priority" : 1 + }, + { + "name" : "Uniprot/SPTREMBL", + "parser" : "UniProtParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_trembl.dat.gz", + "method" : "--bestn 1", + "query_cutoff" : 100, + "target_cutoff" : 100, + "release" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/reldate.txt", + "priority" : 1 + }, + { + "name" : "miRBase", + "parser" : "miRBaseParser", + "file" : "https://mirbase.org/download/miRNA.dat", + "method" : "--bestn 1", + "query_cutoff" : 90, + "target_cutoff" : 90, + "priority" : 1 + }, + { + "name" : "HGNC", + "parser" : 
"HGNCParser", + "file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", + "db" : "ccds", + "priority" : 3 + } +] diff --git a/src/python/ensembl/production/xrefs/config/xref_all_sources.json b/src/python/ensembl/production/xrefs/config/xref_all_sources.json index 929450ebf..1aa80c599 100644 --- a/src/python/ensembl/production/xrefs/config/xref_all_sources.json +++ b/src/python/ensembl/production/xrefs/config/xref_all_sources.json @@ -9,7 +9,7 @@ { "name" : "UniParc", "parser" : "ChecksumParser", - "file" : "https://ftp.ebi.ac.uk/pub/contrib/uniparc/upidump.lis", + "file" : "https://ftp.ebi.ac.uk/pub/contrib/uniparc/upidump.lis.gz", "db" : "checksum", "priority" : 1 }, @@ -20,30 +20,12 @@ "db" : "checksum", "priority" : 1 }, - { - "name" : "DBASS3", - "parser" : "DBASSParser", - "file" : "https://www.dbass.soton.ac.uk/Dbass3/DownloadCsv", - "priority" : 1 - }, - { - "name" : "DBASS5", - "parser" : "DBASSParser", - "file" : "https://www.dbass.soton.ac.uk/Dbass5/DownloadCsv", - "priority" : 1 - }, { "name" : "EntrezGene", "parser" : "EntrezGeneParser", "file" : "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz", "priority" : 1 }, - { - "name" : "HPA", - "parser" : "HPAParser", - "file" : "https://www.proteinatlas.org/download/xref.php", - "priority" : 1 - }, { "name" : "MGI", "parser" : "MGIParser", @@ -52,28 +34,10 @@ }, { "name" : "MGI_desc", - "parser" : "MGI_Desc_Parser", + "parser" : "MGIDescParser", "file" : "https://www.informatics.jax.org/downloads/reports/MRK_List2.rpt", "priority" : 1 }, - { - "name" : "MGI_ccds", - "parser" : "MGI_CCDS_Parser", - "file" : "https://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_mouse/CCDS.current.txt", - "priority" : 2 - }, - { - "name" : "MIM2GENE", - "parser" : "Mim2GeneParser", - "file" : "https://ftp.ncbi.nlm.nih.gov/gene/DATA/mim2gene_medgen", - "priority" : 3 - }, - { - "name" : "MIM", - "parser" : "MIMParser", - "file" : "https://data.omim.org/downloads/ZpPlmgwjuTBK9T5vf2sFjA/omim.txt.gz", - "priority" : 2 - }, { "name" : "RFAM", "parser" : "RFAMParser", @@ -81,17 +45,10 @@ "db" : "core", "priority" : 1 }, - { - "name" : "RFAM", - "parser" : "CoreXrefParser", - "file" : "script:logic_name=>rfam_12.2_gene,object_type=>gene", - "db" : "core", - "priority" : 1 - } { "name" : "RGD", "parser" : "RGDParser", - "file" : "https://download.rgd.mcw.edu/pub/data_release/GENES.RAT.txt", + "file" : "https://download.rgd.mcw.edu/pub/data_release/GENES_RAT.txt", "priority" : 2 }, { @@ -110,7 +67,7 @@ }, { "name" : "RefSeq_dna", - "parser" : "RefSeqGPFFParser", + "parser" : "RefSeqParser", "file" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/complete.*rna.gbff.gz", "method" : "--bestn 5", "query_cutoff" : 90, @@ -122,7 +79,7 @@ }, { "name" : "RefSeq_peptide", - "parser" : "RefSeqGPFFParser", + "parser" : "RefSeqParser", "file" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/complete.*.protein.gpff.gz", "method" : "--bestn 1", "query_cutoff" : 100, @@ -139,13 +96,6 @@ "db" : "otherfeatures", "priority" : 2 }, - { - "name" : "UCSC_hg38", - "parser" : "UCSCParser", - "file" : "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/knownGene.txt.gz", - "release" : "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/README.txt", - "priority" : 1 - }, { "name" : 
"UCSC_mm10", "parser" : "UCSCParser", @@ -218,7 +168,7 @@ { "name" : "Xenbase", "parser" : "XenopusJamboreeParser", - "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping.txt", + "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt", "priority" : 1 }, { @@ -229,12 +179,5 @@ "query_cutoff" : 90, "target_cutoff" : 90, "priority" : 1 - }, - { - "name" : "HGNC", - "parser" : "HGNCParser", - "file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", - "db" : "ccds", - "priority" : 3 } -] \ No newline at end of file +] diff --git a/src/python/ensembl/production/xrefs/config/xref_config.ini b/src/python/ensembl/production/xrefs/config/xref_config.ini index ca3452245..6541d96ae 100644 --- a/src/python/ensembl/production/xrefs/config/xref_config.ini +++ b/src/python/ensembl/production/xrefs/config/xref_config.ini @@ -457,7 +457,7 @@ name = MGI order = 1 priority = 10 prio_descr = descriptions -parser = MGI_Desc_Parser +parser = MGIDescParser [source Reactome::MULTI] # Used by all species @@ -519,7 +519,7 @@ name = RefSeq_dna order = 15 priority = 2 prio_descr = refseq -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_dna::gencode] # Used by human and mouse @@ -527,7 +527,7 @@ name = RefSeq_dna order = 15 priority = 2 prio_descr = refseq -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_dna::MULTI-fungi] # Used by saccharomyces_cerevisiae @@ -542,7 +542,7 @@ name = RefSeq_dna order = 15 priority = 2 prio_descr = refseq -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_dna::MULTI-complete] # Used by phaeodactylum_tricornutum @@ -670,55 +670,55 @@ parser = RefSeqParser name = RefSeq_peptide order = 30 priority = 2 -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide::gencode] name = RefSeq_peptide order = 30 priority = 2 -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide::MULTI-fungi] # Used by saccharomyces_cerevisiae name = RefSeq_peptide order = 25 priority = 2 -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide::MULTI-Plants] name = RefSeq_peptide order = 25 priority = 2 -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide::MULTI-complete] # Used by phaeodactylum_tricornutum name = RefSeq_peptide order = 25 priority = 2 -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide::MULTI-protozoa] # Used by dictyostelium_discoideum name = RefSeq_peptide order = 25 priority = 2 -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide::MULTI-invertebrate] # Used by caenorhabditis_elegans, ciona_savignyi, drosophila_melanogaster name = RefSeq_peptide order = 25 priority = 2 -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide_predicted::MULTI] -# Special source used in RefSeqGPFFParser. No species uses this source. +# Special source used in RefSeqParser. No species uses this source. 
name = RefSeq_peptide_predicted order = 30 priority = 2 prio_descr = refseq -parser = RefSeqGPFFParser +parser = RefSeqParser [source RefSeq_peptide::MULTI-vertebrate] # Used by vertebrates @@ -726,7 +726,7 @@ name = RefSeq_peptide order = 25 priority = 2 prio_descr = refseq -parser = RefSeqGPFFParser +parser = RefSeqParser [source SGD_GENE::saccharomyces_cerevisiae] # Used by saccharomyces_cerevisiae From 571d04e17e5ee2044330c84ef08ca94d31d2f45a Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 2 Dec 2024 14:31:41 +0000 Subject: [PATCH 05/12] Download Pipeline fixes --- nextflow/config/xref.config | 30 +++++-- nextflow/workflows/xrefDownload.nf | 24 +++--- scripts/xrefs/cleanup_and_split_source.pl | 2 +- scripts/xrefs/run_process.pl | 84 ++++++++++++++++++ .../production/xrefs/EmailNotification.py | 85 ++++++++++++------- .../xrefs/config/xref_all_sources.json | 2 +- src/python/scripts/run_module.py | 2 +- 7 files changed, 174 insertions(+), 55 deletions(-) create mode 100644 scripts/xrefs/run_process.pl diff --git a/nextflow/config/xref.config b/nextflow/config/xref.config index a7cef685e..2518e806e 100644 --- a/nextflow/config/xref.config +++ b/nextflow/config/xref.config @@ -14,7 +14,6 @@ params.sources_config_file = "${params.work_dir}/ensembl-production/src/python/e params.source_db_url = '' params.skip_download = 0 params.reuse_db = 0 -params.skip_preparse = 1 params.split_files_by_species = 1 params.tax_ids_file = '' params.update_mode = 0 @@ -23,6 +22,16 @@ params.base_path = '' params.clean_files = 1 params.clean_dir = "${params.base_path}/clean_files" +params.species = '' +params.antispecies = '' +params.division = '' +params.run_all = 0 + +params.history_file = '' +params.dc_config_file = '' +params.old_server_uri = '' +params.registry_file = '' + trace { enabled = true file = "trace" @@ -38,19 +47,20 @@ report { profiles { slurm { process { - errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'finish' } executor = 'slurm' queue = 'production' queueSize = 300 - maxRetries = 2 time = '1d' memory = 100.MB + errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'finish' } + maxRetries = 2 + withLabel:small_process { memory = 200.MB } - withLabel: dm { + withLabel:dm { queue = 'datamover' memory = 2.GB } @@ -59,8 +69,11 @@ profiles { memory = 1.GB } - withLabel:mem4GB { + withLabel:cleanup_mem { memory = 4.GB + errorStrategy = 'retry' + maxRetries = 0 + time = '7d' } withLabel:align_mem { @@ -68,8 +81,11 @@ profiles { maxRetries = 5 memory = { task.attempt <= 5 ? 
4.GB * (task.attempt * task.attempt) : 16.GB } } + + withLabel:mapping_mem { + memory = 4.GB + maxRetries = 0 + } } } } - - diff --git a/nextflow/workflows/xrefDownload.nf b/nextflow/workflows/xrefDownload.nf index e87458735..8034627ed 100644 --- a/nextflow/workflows/xrefDownload.nf +++ b/nextflow/workflows/xrefDownload.nf @@ -4,6 +4,12 @@ params.pipeline_name = 'Xref Download Pipeline' params.help = false +// Ensure all paths are absolute +params.scripts_dir = file(params.scripts_dir).toAbsolutePath().toString() +params.perl_scripts_dir = file(params.perl_scripts_dir).toAbsolutePath().toString() +params.base_path = file(params.base_path).toAbsolutePath().toString() +params.clean_dir = file(params.clean_dir).toAbsolutePath().toString() + println """\ XREF DOWNLOAD PIPELINE ====================== @@ -11,11 +17,9 @@ println """\ base_path : ${params.base_path} reuse_db : ${params.reuse_db} skip_download : ${params.skip_download} - skip_preparse : ${params.skip_preparse} clean_files : ${params.clean_files} split_files_by_species : ${params.split_files_by_species} config_file : ${params.config_file} - sources_config_file : ${params.sources_config_file} clean_dir : ${params.clean_dir} tax_ids_file : ${params.tax_ids_file} update_mode : ${params.update_mode} @@ -38,9 +42,6 @@ def helpMessage() { --skip_download (optional) If set to 1, source files will only be downloaded if they don't already exist in --base_path. Default: 0 - --skip_preparse (optional) If set to 1, the pre-parse step will be skipped (no central DB). - Default: 1 - --clean_files (optional) If set to 1, the Cleanup analysis will be run for RefSeq and UniProt files. Default: 1 @@ -50,9 +51,6 @@ def helpMessage() { --config_file (optional) Path to the json file containing information about xref sources to download. Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_all_sources.json - --sources_config_file (optional) Path to the ini file containing information about all xref sources and species/divisions. - Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_config.ini - --clean_dir (optional) Path where to save the cleaned up files. 
Default: [--base_path]/clean_files @@ -111,7 +109,7 @@ process ScheduleDownload { timestamp = new java.util.Date().format("yyyyMMdd_HHmmss") """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleDownload --config_file ${params.config_file} --source_db_url ${params.source_db_url} --reuse_db ${params.reuse_db} --skip_preparse ${params.skip_preparse} --base_path ${params.base_path} --log_timestamp $timestamp + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleDownload --config_file ${params.config_file} --source_db_url ${params.source_db_url} --reuse_db ${params.reuse_db} --base_path ${params.base_path} --log_timestamp $timestamp """ } @@ -144,7 +142,7 @@ process CleanupTmpFiles { val 'TmpCleanupDone' """ - find ${params.base_path} -type f -name "*.tmp" -delete + find ${params.base_path} -path "${params.clean_dir}" -prune -o -type f -name "*.tmp" -exec rm -f {} + """ } @@ -180,7 +178,7 @@ process Checksum { } process CleanupSplitSource { - label 'mem4GB' + label 'cleanup_mem' tag "$src_name" input: @@ -207,7 +205,7 @@ process CleanupSplitSource { } process CleanupSource { - label 'mem4GB' + label 'cleanup_mem' tag "$src_name" input: @@ -240,4 +238,4 @@ process NotifyByEmail { """ python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp """ -} +} \ No newline at end of file diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl index f1ea08be0..0b956a31d 100644 --- a/scripts/xrefs/cleanup_and_split_source.pl +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -210,7 +210,7 @@ my $species_id_str = sprintf("%04d", $species_id); my @digits = split('', $species_id_str); - $write_path = catdir($output_path, @digits); + $write_path = catdir($output_path, $digits[0], $digits[1], $digits[2], $digits[3]); make_path($write_path); $write_file = catfile($write_path, "$output_file_name-$species_id"); diff --git a/scripts/xrefs/run_process.pl b/scripts/xrefs/run_process.pl new file mode 100644 index 000000000..0fd396aeb --- /dev/null +++ b/scripts/xrefs/run_process.pl @@ -0,0 +1,84 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Data::Dumper; +use Carp; +use Module::Load; +use JSON; + +# List of param names that should be treated as arrays +# TO DO: make this somehow more generic +my $array_params = { + 'analysis_types' => 1, 'datacheck_groups' => 1 +}; + +# Parse the command line parameters sent to the script +my $params = parse_options(); + +if (!defined($params->{'class'})) { + confess "--ERROR-- perl class not defined."; +} + +# Create the module object and initialize it +my $class = $params->{'class'}; +eval("use $class;"); + +my $runnable = $class->new($params); + +# Run the job life cycle +$runnable->fetch_input(); +$runnable->run(); +$runnable->write_output(); + +sub parse_options { + my $params; + my %hash; + + foreach my $option (@ARGV) { + next if ($option !~ /^-/); + + $option =~ s/^-//g; + my @tmp = split("=", $option, 2); + + if ($tmp[0] eq 'dataflow') { + my $decoded_dataflow = decode_json($tmp[1]); + while (my ($dt_key, $dt_val) = each %{$decoded_dataflow}) { + if ($dt_val && ($dt_val =~ /,/ || $array_params->{$dt_key})) { + my @values_array = split(",", $dt_val); + $params->{$dt_key} = \@values_array; + } else { + $params->{$dt_key} = $dt_val; + } + } + next; + } + + if 
($tmp[1] && ($tmp[1] =~ /,/ || $array_params->{$tmp[0]})) { + my @values_array = split(",", $tmp[1]); + $params->{$tmp[0]} = \@values_array; + } else { + $params->{$tmp[0]} = $tmp[1] + } + } + + return $params; +} + +__DATA__ +=pod +=head1 NAME +run_process.pl +=head1 SYNOPSIS + run_process.pl -class= [] +=head1 DESCRIPTION +run_process.pl is a generic script that is used to call runnables from a Nextflow .nf file. This script initializes the module object and runs the life cycle of that module: fetch_input(), run(), and write_output() +=head1 LICENSE + Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute + Copyright [2016-2022] EMBL-European Bioinformatics Institute + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License + is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and limitations under the License. +=cut diff --git a/src/python/ensembl/production/xrefs/EmailNotification.py b/src/python/ensembl/production/xrefs/EmailNotification.py index f574f2dc3..932b0c1b7 100644 --- a/src/python/ensembl/production/xrefs/EmailNotification.py +++ b/src/python/ensembl/production/xrefs/EmailNotification.py @@ -32,7 +32,10 @@ def run(self): email_server: str = self.get_param("email_server", {"required": True, "type": str}) log_timestamp: str = self.get_param("log_timestamp", {"type": str}) - email_message = f"The {pipeline_name} has completed its run.
" + email_message = f"

The {pipeline_name} has completed its run.
" + if re.search("Download", pipeline_name): + email_message += "To run the Xref Process Pipeline based on the data from this pipeline, use the same --source_db_url, --split_files_by_species, and --config_file values provided to this pipeline." + email_message += "

" if log_timestamp: # Get the path of the log files @@ -67,6 +70,7 @@ def combine_logs(self, base_path: str, timestamp: str, type: str) -> str: "ScheduleDownload", "DownloadSource", "ScheduleCleanup", + "Checksum", "Cleanup(.*)Source", "EmailNotification", ], @@ -117,7 +121,7 @@ def extract_parameters(self, data: str) -> Dict[str, str]: return {param[0]: param[1] for param in parameters_list} def format_parameters(self, parameters: Dict[str, str]) -> str: - message = "
 The pipeline was run with the following parameters:<br>" + message = "<br> Run Parameters<br>" for param_name, param_value in parameters.items(): message += f"{param_name} = {param_value}
" @@ -133,12 +137,31 @@ def extract_download_statistics(self, data: str) -> Tuple[Dict[str, Dict[str, An def extract_sources_data(self, data: str) -> Dict[str, Dict[str, Any]]: sources_data = {} - sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to download: ([\w\/]+)", "to_download")) - sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to cleanup: ([\w\/]+)", "to_cleanup")) - sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source ([\w\/]+) cleaned up", "cleaned_up")) - sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file already exists, skipping download \((.*)\)", "skipped", True)) - sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file downloaded via (HTTP|FTP): (.*)", "downloaded", True)) - sources_data.update(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file copied from local FTP: (.*)", "copied", True)) + # Helper function to update sources_data + def update_sources_data(new_data: Dict[str, Dict[str, Any]]): + for key, value in new_data.items(): + if key in sources_data: + sources_data[key].update(value) + else: + sources_data[key] = value + + # Get sources set to be downloaded + update_sources_data(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to download: ([\w\/]+)", "to_download")) + + # Get sources set to be cleaned up + update_sources_data(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source to cleanup: ([\w\/]+)", "to_cleanup")) + + # Get sources cleaned up + update_sources_data(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Source ([\w\/]+) cleaned up", "cleaned_up")) + + # Get sources skipped + update_sources_data(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file already exists, skipping download \((.*)\)", "skipped", True)) + + # Get sources downloaded + update_sources_data(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file downloaded via (HTTP|FTP): (.*)", "downloaded", True)) + + # Get sources copied + update_sources_data(self.extract_sources(data, r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ([\w\/]+) file copied from local FTP: (.*)", "copied", True)) return sources_data @@ -151,7 +174,7 @@ def extract_sources(self, data: str, pattern: str, key: str, split: bool = False if key == "skipped" or key == "copied": val = os.path.dirname(match[1]) else: - val = f"{match[1]}|" + os.path.dirname(match[2]) + val = os.path.dirname(match[2]) sources[match[0]] = {key: val} else: sources[match] = {key: True} @@ -167,33 +190,31 @@ def extract_added_species(self, data: str) -> Dict[str, str]: return {species[0]: species[1] for species in added_species_list} def format_download_statistics(self, sources_data: Dict[str, Dict[str, Any]], added_species: Dict[str, str], skipped_species: Dict[str, str]) -> str: - message = "
--Source Statistics--
" + cell_style = 'style="border-right: 1px solid #000; padding: 5px;"' + message = "
Source Statistics
" + message += f"" + message += f"" for source_name, source_values in sources_data.items(): - message += f"{source_name}:
" - if source_values.get("to_download"): - message += f"{self.INDENT}Scheduled for download ✔
" - if source_values.get("downloaded"): - download_type, file_path = source_values["downloaded"].split("|") - message += f"{self.INDENT}File downloaded via {download_type} into {file_path}
" - elif source_values.get("copied"): - message += f"{self.INDENT}File(s) copied from local FTP into {source_values['copied']}
" - elif source_values.get("skipped"): - message += f"{self.INDENT}File(s) download skipped, already exists in {source_values['skipped']}
" - if source_values.get("to_cleanup"): - message += f"{self.INDENT}Scheduled for cleanup ✔
" - if source_values.get("cleaned_up"): - message += f"{self.INDENT}Cleaned up ✔
" - - message += "
--Species Statistics--
" - message += "Skipped Species (files already exist):
" - for source_name, count in skipped_species.items(): - message += f"{self.INDENT}{source_name}: {count}
" - message += "Added Species (files created):
" + message += f"" + message += f"" if source_values.get("to_download") else f"" + message += f"" if source_values.get("downloaded") or source_values.get("copied") else f"" + message += f"" if source_values.get("skipped") else f"" + message += f"" if source_values.get("to_cleanup") else f"" + message += f"" + message += "" + message += "
Source | Scheduled | Downloaded | Download Skipped | Cleaned-up | Location
{source_name} | X | X | X | X | {source_values.get('downloaded', source_values.get('copied', source_values.get('skipped', '')))}
" + + message += "
Species Statistics
" + message += "Added Species (files created):
    " for source_name, count in added_species.items(): - message += f"{self.INDENT}{source_name}: {count}
    " + message += f"
  • {source_name}: {count}
  • " + message += "
" + message += "Skipped Species (files already exist):
    " + for source_name, count in skipped_species.items(): + message += f"
  • {source_name}: {count}
  • " + message += "
" - message += "
To run the Xref Process Pipeline based on the data from this pipeline, use the same --source_db_url, --split_files_by_species, and --config_file values provided to this pipeline." return message def extract_process_statistics(self, data: str) -> Tuple[Dict[str, Dict[str, str]], Dict[str, Dict[str, int]]]: diff --git a/src/python/ensembl/production/xrefs/config/xref_all_sources.json b/src/python/ensembl/production/xrefs/config/xref_all_sources.json index 1aa80c599..006d8ce71 100644 --- a/src/python/ensembl/production/xrefs/config/xref_all_sources.json +++ b/src/python/ensembl/production/xrefs/config/xref_all_sources.json @@ -168,7 +168,7 @@ { "name" : "Xenbase", "parser" : "XenopusJamboreeParser", - "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt", + "file" : "https://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt", "priority" : 1 }, { diff --git a/src/python/scripts/run_module.py b/src/python/scripts/run_module.py index 874f02dd8..3b7f32cef 100644 --- a/src/python/scripts/run_module.py +++ b/src/python/scripts/run_module.py @@ -21,7 +21,7 @@ def main(): params = Params() - module_name = params.param_required('module') + module_name = params.get_param('module', {"required": True}) class_name = module_name.split(".")[-1] module = importlib.import_module(module_name) From ccf167de47ff9373e8469760379c22e6be015ab5 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Tue, 3 Dec 2024 09:45:34 +0000 Subject: [PATCH 06/12] New xref processing pipleine --- nextflow/workflows/xrefProcess.nf | 7 +- scripts/xrefs/coordinate_mapper.pl | 101 +-- scripts/xrefs/dump_ensembl.pl | 26 +- scripts/xrefs/refseq_coordinate_parser.pl | 87 +- .../production/xrefs/AdvisoryXrefReport.py | 13 +- .../ensembl/production/xrefs/Alignment.py | 71 +- .../production/xrefs/CoordinateMapping.py | 29 +- .../ensembl/production/xrefs/DirectXrefs.py | 22 +- .../ensembl/production/xrefs/DumpEnsembl.py | 69 +- .../ensembl/production/xrefs/DumpXref.py | 97 +- .../xrefs/EmailAdvisoryXrefReport.py | 66 +- .../ensembl/production/xrefs/Mapping.py | 23 +- .../ensembl/production/xrefs/ParseSource.py | 46 +- .../production/xrefs/ProcessAlignment.py | 21 +- .../production/xrefs/RNACentralMapping.py | 29 +- .../production/xrefs/ScheduleAlignment.py | 37 +- .../production/xrefs/ScheduleMapping.py | 21 +- .../ensembl/production/xrefs/ScheduleParse.py | 124 +-- .../production/xrefs/ScheduleSpecies.py | 177 ++-- .../production/xrefs/UniParcMapping.py | 29 +- .../production/xrefs/mappers/BasicMapper.py | 209 ++--- .../xrefs/mappers/ChecksumMapper.py | 22 +- .../xrefs/mappers/CoordinateMapper.py | 162 ++-- .../production/xrefs/mappers/CoreInfo.py | 370 ++++---- .../xrefs/mappers/DirectXrefsMapper.py | 36 +- .../production/xrefs/mappers/DisplayXrefs.py | 362 +++----- .../xrefs/mappers/OfficialNaming.py | 227 +++-- .../xrefs/mappers/ProcessMappings.py | 486 +++++----- .../production/xrefs/mappers/ProcessMoves.py | 497 +++++------ .../production/xrefs/mappers/ProcessPaired.py | 47 +- .../xrefs/mappers/ProcessPriorities.py | 46 +- .../xrefs/mappers/RNACentralMapper.py | 3 +- .../production/xrefs/mappers/TestMappings.py | 108 ++- .../production/xrefs/mappers/UniParcMapper.py | 3 +- .../production/xrefs/mappers/XrefLoader.py | 843 +++++++++--------- .../xrefs/mappers/methods/ChecksumBasic.py | 33 +- .../xrefs/mappers/methods/MySQLChecksum.py | 23 +- .../xrefs/mappers/species/aedes_aegypti.py | 4 +- .../mappers/species/anopheles_gambiae.py | 4 +- 
.../mappers/species/culex_quinquefasciatus.py | 4 +- .../xrefs/mappers/species/danio_rerio.py | 3 +- .../xrefs/mappers/species/drosophila.py | 4 +- .../xrefs/mappers/species/eukaryota.py | 25 +- .../xrefs/mappers/species/homo_sapiens.py | 4 +- .../mappers/species/ixodes_scapularis.py | 4 +- .../xrefs/mappers/species/mus_musculus.py | 4 +- .../mappers/species/neurospora_crassa.py | 4 +- .../xrefs/mappers/species/parasite.py | 3 +- .../mappers/species/rattus_norvegicus.py | 4 +- .../species/saccharomyces_cerevisiae.py | 4 +- .../xrefs/mappers/species/sars_cov_2.py | 14 +- .../species/schizosaccharomyces_pombe.py | 4 +- .../xrefs/mappers/species/sus_scrofa.py | 4 +- .../xrefs/mappers/species/wormbase.py | 14 +- .../xrefs/parsers/ArrayExpressParser.py | 14 +- .../production/xrefs/parsers/BaseParser.py | 51 +- .../production/xrefs/parsers/CCDSParser.py | 6 +- .../xrefs/parsers/EntrezGeneParser.py | 8 +- .../production/xrefs/parsers/HGNCParser.py | 10 +- .../xrefs/parsers/JGI_ProteinParser.py | 2 +- .../xrefs/parsers/Mim2GeneParser.py | 6 +- .../production/xrefs/parsers/RFAMParser.py | 12 +- .../production/xrefs/parsers/RGDParser.py | 2 +- .../xrefs/parsers/ReactomeParser.py | 2 +- .../xrefs/parsers/RefSeqCoordinateParser.py | 21 +- .../production/xrefs/parsers/RefSeqParser.py | 10 +- .../production/xrefs/parsers/UniProtParser.py | 4 +- .../production/xrefs/parsers/ZFINParser.py | 6 +- .../production/xrefs/parsers/miRBaseParser.py | 2 +- 69 files changed, 2356 insertions(+), 2479 deletions(-) diff --git a/nextflow/workflows/xrefProcess.nf b/nextflow/workflows/xrefProcess.nf index 02517aa60..8ae1d8c19 100644 --- a/nextflow/workflows/xrefProcess.nf +++ b/nextflow/workflows/xrefProcess.nf @@ -4,6 +4,11 @@ params.pipeline_name = 'Xref Process Pipeline' params.help = false +// Ensure all paths are absolute +params.scripts_dir = file(params.scripts_dir).toAbsolutePath().toString() +params.perl_scripts_dir = file(params.perl_scripts_dir).toAbsolutePath().toString() +params.base_path = file(params.base_path).toAbsolutePath().toString() + println """\ XREF PROCESS PIPELINE ====================== @@ -622,6 +627,6 @@ process NotifyByEmail { val timestamp """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --release ${params.release} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp """ } \ No newline at end of file diff --git a/scripts/xrefs/coordinate_mapper.pl b/scripts/xrefs/coordinate_mapper.pl index 76c06775f..43fd99af8 100644 --- a/scripts/xrefs/coordinate_mapper.pl +++ b/scripts/xrefs/coordinate_mapper.pl @@ -20,9 +20,10 @@ use DBI; use JSON; use Getopt::Long; +use File::Spec::Functions qw(catfile); use Nextflow::Utils; -use Bio::EnsEMBL::DBSQL::DBAdaptor +use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::Mapper::RangeRegistry; my ($xref_db_url, $core_db_url, $species_id, $output_dir, $analysis_id); @@ -35,8 +36,8 @@ ); # Check that all parameters are passed -if (!defined($xref_db_url) || !defined($core_db_url) || !defined($species_id) || !defined($output_dir) || !defined($analysis_id)) { - croak "Usage: dump_ensembl.pl --xref_db_url --core_db_url --species_id --output_dir 
--analysis_id "; +foreach my $param ($xref_db_url, $core_db_url, $species_id, $output_dir, $analysis_id) { + defined $param or croak "Usage: dump_ensembl.pl --xref_db_url --core_db_url --species_id --output_dir --analysis_id "; } # Set the files to use @@ -45,27 +46,30 @@ my $unmapped_reason_filename = catfile($output_dir, 'unmapped_reason_coord.txt'); my $unmapped_object_filename = catfile($output_dir, 'unmapped_object_coord.txt'); -# Connect tp dbs -my ($core_user, $core_pass, $core_host, $core_port, $core_dbname) = parse_url($core_db_url); +# Connect to dbs +my ($core_host, $core_port, $core_user, $core_pass, $core_dbname) = parse_url($core_db_url); my $core_dbi = get_dbi($core_host, $core_port, $core_user, $core_pass, $core_dbname); my $xref_dbi = get_dbi(parse_url($xref_db_url)); # Figure out the last used IDs in the core DB -my $xref_id = $core_dbi->selectall_arrayref('SELECT MAX(xref_id) FROM xref')->[0][0]; -my $object_xref_id = $core_dbi->selectall_arrayref('SELECT MAX(object_xref_id) FROM object_xref')->[0][0]; -my $unmapped_object_id = $core_dbi->selectall_arrayref('SELECT MAX(unmapped_object_id) FROM unmapped_object')->[0][0]; -my $unmapped_reason_id = $core_dbi->selectall_arrayref('SELECT MAX(unmapped_reason_id) FROM unmapped_reason')->[0][0]; +my $xref_id = $core_dbi->selectrow_array('SELECT MAX(xref_id) FROM xref') || 0; +my $object_xref_id = $core_dbi->selectrow_array('SELECT MAX(object_xref_id) FROM object_xref') || 0; +my $unmapped_object_id = $core_dbi->selectrow_array('SELECT MAX(unmapped_object_id) FROM unmapped_object') || 0; +my $unmapped_reason_id = $core_dbi->selectrow_array('SELECT MAX(unmapped_reason_id) FROM unmapped_reason') || 0; my (%unmapped, %mapped); my $external_db_id; # Read and store available Xrefs from the Xref database -my $xref_sth = $xref_dbi->prepare("SELECT c.coord_xref_id,s.name,c.accession FROM coordinate_xref c,source s WHERE c.source_id=s.source_id AND c.species_id=?"); +my $xref_sth = $xref_dbi->prepare("SELECT c.coord_xref_id, s.name, c.accession FROM coordinate_xref c JOIN source s ON c.source_id = s.source_id WHERE c.species_id = ?"); $xref_sth->bind_param(1, $species_id, SQL_INTEGER); $xref_sth->execute(); while (my $xref = $xref_sth->fetchrow_hashref()) { - $external_db_id ||= $core_dbi->selectall_arrayref('SELECT external_db_id FROM external_db WHERE db_name='.$xref->{'name'})->[0][0]; + my $sth_external_db = $core_dbi->prepare('SELECT external_db_id FROM external_db WHERE db_name = ?'); + $sth_external_db->execute($xref->{'name'}); + $external_db_id ||= ($sth_external_db->fetchrow_array())[0]; + $sth_external_db->finish(); $external_db_id ||= 11000; # FIXME (11000 is 'UCSC') $unmapped{$xref->{'coord_xref_id'}} = { @@ -77,16 +81,14 @@ } $xref_sth->finish(); -if (!defined($external_db_id)) { - die "External_db_id is undefined for species_id = $species_id\n"; -} +defined $external_db_id or die "External_db_id is undefined for species_id = $species_id\n"; # Start the coordinate matching my $core_db_adaptor = Bio::EnsEMBL::DBSQL::DBAdaptor->new( - -host => $core_host, - -port => $core_port, - -user => $core_user, - -pass => $core_pass, + -host => $core_host, + -port => $core_port, + -user => $core_user, + -pass => $core_pass, -dbname => $core_dbname, ); @@ -111,7 +113,7 @@ my $chr_name = $chromosome->seq_region_name(); my @genes = @{ $chromosome->get_all_Genes( undef, undef, 1 ) }; - while (my $gene = shift(@genes)) { + foreach my $gene (@genes) { my @transcripts = @{ $gene->get_all_Transcripts() }; my %gene_result; @@ -144,12 +146,7 @@ # 
'$rr1' is the RangeRegistry holding Ensembl exons for one transcript at a time. my $rr1 = Bio::EnsEMBL::Mapper::RangeRegistry->new(); - my $coding_transcript; - if (defined($transcript->translation())) { - $coding_transcript = 1; - } else { - $coding_transcript = 0; - } + my $coding_transcript = defined($transcript->translation()) ? 1 : 0; foreach my $exon (@exons) { # Register each exon in the RangeRegistry. Register both the @@ -198,7 +195,7 @@ for (my $i = 0 ; $i < $exonCount ; ++$i) { # Register the exons from the external database in the same - # was as with the Ensembl exons, and calculate the overlap + # way as with the Ensembl exons, and calculate the overlap # of the external exons with the previously registered # Ensembl exons. @@ -206,9 +203,7 @@ $exon_match += $overlap/($exonEnds[$i] - $exonStarts[$i] + 1); $rr2->check_and_register('exon', $exonStarts[$i], $exonEnds[$i]); - if (!defined($cdsStart) || !defined($cdsEnd)) { - # Non-coding transcript. - } else { + if (defined($cdsStart) && defined($cdsEnd)) { my $codingStart = ($exonStarts[$i] > $cdsStart ? $exonStarts[$i] : $cdsStart); my $codingEnd = ($exonEnds[$i] < $cdsEnd ? $exonEnds[$i] : $cdsEnd); @@ -255,7 +250,7 @@ $coding_weight*($coding_count + $ens_weight*$rcoding_count) ); - if (!defined( $transcript_result{$coord_xref_id}) || $transcript_result{$coord_xref_id} < $score) { + if (!defined($transcript_result{$coord_xref_id}) || $transcript_result{$coord_xref_id} < $score) { $transcript_result{$coord_xref_id} = $score; } @@ -266,16 +261,16 @@ # this transcript. my $best_score; - foreach my $coord_xref_id (sort( { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result) )) { + foreach my $coord_xref_id (sort { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result)) { my $score = $transcript_result{$coord_xref_id}; if ($score > $transcript_score_threshold) { $best_score ||= $score; if (sprintf("%.3f", $score) eq sprintf("%.3f", $best_score)) { - if (exists( $unmapped{$coord_xref_id})) { + if (exists($unmapped{$coord_xref_id})) { $mapped{$coord_xref_id} = $unmapped{$coord_xref_id}; - delete( $unmapped{$coord_xref_id} ); + delete($unmapped{$coord_xref_id}); $mapped{$coord_xref_id}{'reason'} = undef; $mapped{$coord_xref_id}{'reason_full'} = undef; $mapped{$coord_xref_id}{'chr_name'} = $chr_name; @@ -287,21 +282,21 @@ }); # This is now a candidate Xref for the gene. 
- if (!defined( $gene_result{$coord_xref_id}) || $gene_result{$coord_xref_id} < $score) { + if (!defined($gene_result{$coord_xref_id}) || $gene_result{$coord_xref_id} < $score) { $gene_result{$coord_xref_id} = $score; } } elsif (exists($unmapped{$coord_xref_id})) { $unmapped{$coord_xref_id}{'reason'} = 'Was not best match'; $unmapped{$coord_xref_id}{'reason_full'} = sprintf("Did not top best transcript match score (%.2f)", $best_score); - if (!defined( $unmapped{$coord_xref_id}{'score'}) || $score > $unmapped{$coord_xref_id}{'score'}) { + if (!defined($unmapped{$coord_xref_id}{'score'}) || $score > $unmapped{$coord_xref_id}{'score'}) { $unmapped{$coord_xref_id}{'score'} = $score; $unmapped{$coord_xref_id}{'ensembl_id'} = $transcript->dbID(); } } - } elsif (exists( $unmapped{$coord_xref_id}) && $unmapped{$coord_xref_id}{'reason'} ne 'Was not best match') { + } elsif (exists($unmapped{$coord_xref_id}) && $unmapped{$coord_xref_id}{'reason'} ne 'Was not best match') { $unmapped{$coord_xref_id}{'reason'} = 'Did not meet threshold'; - $unmapped{$coord_xref_id}{'reason_full'} = sprintf( "Match score for transcript lower than threshold (%.2f)", $transcript_score_threshold); - if (!defined( $unmapped{$coord_xref_id}{'score'}) || $score > $unmapped{$coord_xref_id}{'score'}) { + $unmapped{$coord_xref_id}{'reason_full'} = sprintf("Match score for transcript lower than threshold (%.2f)", $transcript_score_threshold); + if (!defined($unmapped{$coord_xref_id}{'score'}) || $score > $unmapped{$coord_xref_id}{'score'}) { $unmapped{$coord_xref_id}{'score'} = $score; $unmapped{$coord_xref_id}{'ensembl_id'} = $transcript->dbID(); } @@ -325,35 +320,21 @@ sub parse_url { my ($url) = @_; - my $parsed_url = Nextflow::Utils::parse($url); - my $user = $parsed_url->{'user'}; - my $pass = $parsed_url->{'pass'}; - my $host = $parsed_url->{'host'}; - my $port = $parsed_url->{'port'}; - my $db = $parsed_url->{'dbname'}; - - return ($host, $port, $user, $pass, $db) + return @{$parsed_url}{qw(host port user pass dbname)}; } sub get_dbi { my ($host, $port, $user, $pass, $dbname) = @_; - - my $dbconn; - if (defined $dbname) { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname); - } else { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s", $host, $port); - } - my $dbi = DBI->connect( $dbconn, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr ); - + my $dbconn = defined $dbname ? sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname) : sprintf("dbi:mysql:host=%s;port=%s", $host, $port); + my $dbi = DBI->connect($dbconn, $user, $pass, { 'RaiseError' => 1 }) or croak("Can't connect to database: " . $DBI::errstr); return $dbi; } sub dump_xref { my ($filename, $xref_id, $mapped, $unmapped) = @_; - my $fh = IO::File->new('>' . $filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + my $fh = IO::File->new($filename, 'w') or croak(sprintf("Can not open '%s' for writing", $filename)); foreach my $xref (values(%{$unmapped}), values(%{$mapped})) { # Assign 'xref_id' to this Xref. @@ -382,7 +363,7 @@ sub dump_xref { sub dump_object_xref { my ($filename, $object_xref_id, $analysis_id, $mapped) = @_; - my $fh = IO::File->new('>' . 
$filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + my $fh = IO::File->new($filename, 'w') or croak(sprintf("Can not open '%s' for writing", $filename)); foreach my $xref (values(%{$mapped})) { foreach my $object_xref (@{ $xref->{'mapped_to'} }) { @@ -417,7 +398,7 @@ sub dump_unmapped_reason { } } - my $fh = IO::File->new('>' . $filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + my $fh = IO::File->new($filename, 'w') or croak(sprintf("Can not open '%s' for writing", $filename)); my $sth = $core_dbi->prepare('SELECT unmapped_reason_id FROM unmapped_reason WHERE full_description = ?'); @@ -457,7 +438,7 @@ sub dump_unmapped_reason { sub dump_unmapped_object { my ($filename, $unmapped_object_id, $analysis_id, $unmapped) = @_; - my $fh = IO::File->new('>' . $filename) or croak(sprintf("Can not open '%s' for writing", $filename)); + my $fh = IO::File->new($filename, 'w') or croak(sprintf("Can not open '%s' for writing", $filename)); foreach my $xref (values(%{$unmapped})) { # Assign 'unmapped_object_id' to this Xref. @@ -523,7 +504,7 @@ sub upload_data { my $load_sql = sprintf("LOAD DATA LOCAL INFILE ? REPLACE INTO TABLE %s", $table_name); - my $rows = $dbi->do($cleanup_sql, undef, $external_db_id) or croak($dbi->strerr()); + my $rows = $dbi->do($cleanup_sql, undef, $external_db_id) or croak($dbi->errstr()); $rows = $dbi->do($load_sql, undef, $filename) or croak($dbi->errstr()); diff --git a/scripts/xrefs/dump_ensembl.pl b/scripts/xrefs/dump_ensembl.pl index 22132195d..1e035090d 100644 --- a/scripts/xrefs/dump_ensembl.pl +++ b/scripts/xrefs/dump_ensembl.pl @@ -34,21 +34,21 @@ ); # Check that all parameters are passed -if (!defined($cdna_path) || !defined($pep_path) || !defined($species) || !defined($core_db_url) || !defined($release)) { - croak "Usage: dump_ensembl.pl --cdna_path --pep_path --species --core_db_url --release "; +foreach my $param ($cdna_path, $pep_path, $species, $core_db_url, $release) { + defined $param or croak "Usage: dump_ensembl.pl --cdna_path --pep_path --species --core_db_url --release "; } # Open fasta files for writing -my $cdna_fh = IO::File->new($cdna_path ,'w') || throw("Cannot create filehandle $cdna_path"); +my $cdna_fh = IO::File->new($cdna_path, 'w') or croak "Cannot create filehandle $cdna_path"; my $cdna_writer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($cdna_fh); -my $pep_fh = IO::File->new($pep_path ,'w') || throw("Cannot create filehandle $pep_path"); +my $pep_fh = IO::File->new($pep_path, 'w') or croak "Cannot create filehandle $pep_path"; my $pep_writer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($pep_fh); # Load the registry -my ($user, $pass, $host, $port, $dbname) = parse_url($core_db_url); +my ($host, $port, $user, $pass, $dbname) = parse_url($core_db_url); my $registry = 'Bio::EnsEMBL::Registry'; my %registry_params = (-HOST => $host, -PORT => $port, -USER => $user, -DB_VERSION => $release); -$registry_params{-PASS} = $pass if ($pass); +$registry_params{-PASS} = $pass if $pass; $registry->load_registry_from_db(%registry_params); # Get transcripts @@ -56,14 +56,13 @@ my $transcript_list = $transcript_adaptor->fetch_all(); # Dump sequence data -while (my $transcript = shift @$transcript_list) { +foreach my $transcript (@$transcript_list) { my $sequence = $transcript->seq(); $sequence->id($transcript->dbID()); $cdna_writer->print_Seq($sequence); # Get and dump translation data - my $translation = $transcript->translation; - if ($translation) { + if (my $translation = $transcript->translation) { 
$sequence = $transcript->translate; $sequence->id($translation->dbID()); $pep_writer->print_Seq($sequence); @@ -77,10 +76,5 @@ sub parse_url { my ($url) = @_; my $parsed_url = Nextflow::Utils::parse($url); - my $user = $parsed_url->{'user'}; - my $pass = $parsed_url->{'pass'}; - my $host = $parsed_url->{'host'}; - my $port = $parsed_url->{'port'}; - my $db = $parsed_url->{'dbname'}; - return ($user, $pass, $host, $port, $db); -} \ No newline at end of file + return @{$parsed_url}{qw(host port user pass dbname)}; +} diff --git a/scripts/xrefs/refseq_coordinate_parser.pl b/scripts/xrefs/refseq_coordinate_parser.pl index dae228391..39c9e4059 100644 --- a/scripts/xrefs/refseq_coordinate_parser.pl +++ b/scripts/xrefs/refseq_coordinate_parser.pl @@ -37,8 +37,8 @@ ); # Check that all parameters are passed -if (!defined($xref_db_url) || !defined($core_db_url) || !defined($otherf_db_url) || !defined($source_ids_json) || !defined($species_id) || !defined($species_name) || !defined($release)) { - croak "Usage: dump_ensembl.pl --xref_db_url --core_db_url --otherf_db_url --source_ids --species_id --species_name --release "; +foreach my $param ($xref_db_url, $core_db_url, $otherf_db_url, $source_ids_json, $species_id, $species_name, $release) { + defined $param or croak "Usage: refseq_coordinate_parser.pl --xref_db_url --core_db_url --otherf_db_url --source_ids --species_id --species_name --release "; } my $transcript_score_threshold = 0.75; @@ -48,13 +48,12 @@ my $source_ids = decode_json($source_ids_json); # Connect to the xref db -my ($user, $pass, $host, $port, $xref_db) = parse_url($xref_db_url); -my $dbi = get_dbi($host, $port, $user, $pass, $xref_db); +my $dbi = get_dbi(parse_url($xref_db_url)); # Load the registry my $registry = 'Bio::EnsEMBL::Registry'; -my ($core_user, $core_pass, $core_host, $core_port, $core_dbname) = parse_url($core_db_url); -my ($otherf_user, $otherf_pass, $otherf_host, $otherf_port, $otherf_dbname) = parse_url($otherf_db_url); +my ($core_host, $core_port, $core_user, $core_pass, $core_dbname) = parse_url($core_db_url); +my ($otherf_host, $otherf_port, $otherf_user, $otherf_pass, $otherf_dbname) = parse_url($otherf_db_url); $registry->load_registry_from_multiple_dbs( { -host => $core_host, @@ -73,11 +72,11 @@ ); # Get the EntrezGene and WikiGene accessions -my (%entrez_ids) = %{ get_valid_codes("EntrezGene", $species_id, $dbi) }; -my (%wiki_ids) = %{ get_valid_codes('WikiGene', $species_id, $dbi) }; +my %entrez_ids = %{ get_valid_codes("EntrezGene", $species_id, $dbi) }; +my %wiki_ids = %{ get_valid_codes('WikiGene', $species_id, $dbi) }; # Prepare link sql -my $add_dependent_xref_sth = $dbi->prepare("INSERT INTO dependent_xref (master_xref_id, dependent_xref_id, linkage_source_id) VALUES (?,?,?)"); +my $add_dependent_xref_sth = $dbi->prepare("INSERT IGNORE INTO dependent_xref (master_xref_id, dependent_xref_id, linkage_source_id) VALUES (?,?,?)"); # Get the db adaptors my $otherf_dba = $registry->get_DBAdaptor($species_name, 'otherfeatures'); @@ -98,8 +97,8 @@ # Not all species have refseq_import data, skip if not found if (!defined $logic_name) { - print STDERR "No data found for RefSeq_import, skipping import\n";; - exit 1; + print STDERR "No data found for RefSeq_import, skipping import\n"; + exit 0; } # Get otherfeatures chromosomes @@ -170,7 +169,7 @@ $start = $core_exon->seq_region_start(); $end = $core_exon->seq_region_end(); $overlap = $rr1->overlap_size('exon', $start, $end); - $core_exon_match += $overlap/($end - $start + 1); + $core_exon_match += $overlap / 
($end - $start + 1); $rr2->check_and_register('exon', $start, $end); } @@ -178,30 +177,30 @@ $start = $core_tl_exon->seq_region_start(); $end = $core_tl_exon->seq_region_end(); $overlap = $rr3->overlap_size('exon', $start, $end); - $core_tl_exon_match += $overlap/($end - $start + 1); + $core_tl_exon_match += $overlap / ($end - $start + 1); $rr4->check_and_register('exon', $start, $end); } - # Look for oeverlap between the two sets of exons + # Look for overlap between the two sets of exons foreach my $otherf_exon (@$otherf_exons) { $start = $otherf_exon->seq_region_start(); $end = $otherf_exon->seq_region_end(); $overlap = $rr2->overlap_size('exon', $start, $end); - $otherf_exon_match += $overlap/($end - $start + 1); + $otherf_exon_match += $overlap / ($end - $start + 1); } foreach my $otherf_tl_exon (@$otherf_tl_exons) { $start = $otherf_tl_exon->seq_region_start(); $end = $otherf_tl_exon->seq_region_end(); $overlap = $rr4->overlap_size('exon', $start, $end); - $otherf_tl_exon_match += $overlap/($end - $start + 1); + $otherf_tl_exon_match += $overlap / ($end - $start + 1); } # Compare exon matching with number of exons to give a score - my $score = ( ($otherf_exon_match + $core_exon_match)) / (scalar(@$otherf_exons) + scalar(@$core_exons) ); + my $score = ($otherf_exon_match + $core_exon_match) / (scalar(@$otherf_exons) + scalar(@$core_exons)); my $tl_score = 0; if (scalar(@$otherf_tl_exons) > 0) { - $tl_score = ( ($otherf_tl_exon_match + $core_tl_exon_match)) / (scalar(@$otherf_tl_exons) + scalar(@$core_tl_exons) ); + $tl_score = ($otherf_tl_exon_match + $core_tl_exon_match) / (scalar(@$otherf_tl_exons) + scalar(@$core_tl_exons)); } if ($core_transcript->biotype eq $otherf_transcript->biotype) { $transcript_result{$core_transcript->stable_id} = $score; @@ -216,7 +215,7 @@ my ($best_id, $score, $tl_score); # Compare the scores based on coding exon overlap - # If there is a stale mate, chose best exon overlap score + # If there is a stale mate, choose best exon overlap score foreach my $tid (sort { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result)) { $score = $transcript_result{$tid}; $tl_score = $tl_transcript_result{$tid}; @@ -234,7 +233,7 @@ } } } - if (!defined $best_id) { + if (!defined $best_id) { if ($score >= $best_score) { $best_id = $tid; $best_score = $score; @@ -276,7 +275,7 @@ my $translation = $transcript->translation(); # Add link between Ensembl gene and EntrezGene (and WikiGene) - if (defined $entrez_ids{$entrez_id} ) { + if (defined $entrez_ids{$entrez_id}) { foreach my $dependent_xref_id (@{$entrez_ids{$entrez_id}}) { $add_dependent_xref_sth->execute($xref_id, $dependent_xref_id, $source_ids->{'entrezgene'}); } @@ -288,7 +287,7 @@ # Also store refseq protein as direct xref for ensembl translation, if translation exists if (defined $translation && defined $otherf_translation && ($otherf_translation->seq eq $translation->seq)) { my $translation_id = $otherf_translation->stable_id(); - my @xrefs = grep {$_->{dbname} eq 'GenBank'} @{$otherf_translation->get_all_DBEntries}; + my @xrefs = grep { $_->{dbname} eq 'GenBank' } @{$otherf_translation->get_all_DBEntries}; if (scalar @xrefs == 1) { $translation_id = $xrefs[0]->primary_id(); } @@ -298,14 +297,14 @@ $source_id = $source_ids->{'peptide'}; $source_id = $source_ids->{'peptide_predicted'} if $acc =~ /^XP_/; my $tl_xref_id = add_xref({ - acc => $acc, - version => $version, - label => $translation_id, - desc => undef, - source_id => $source_id, + acc => $acc, + version => $version, + label => 
$translation_id, + desc => undef, + source_id => $source_id, species_id => $species_id, - dbi => $dbi, - info_type => 'DIRECT' + dbi => $dbi, + info_type => 'DIRECT' }); add_direct_xref($tl_xref_id, $translation->stable_id(), "Translation", "", $dbi); } @@ -316,39 +315,25 @@ sub parse_url { my ($url) = @_; - my $parsed_url = Nextflow::Utils::parse($url); - my $user = $parsed_url->{'user'}; - my $pass = $parsed_url->{'pass'}; - my $host = $parsed_url->{'host'}; - my $port = $parsed_url->{'port'}; - my $db = $parsed_url->{'dbname'}; - - return ($user, $pass, $host, $port, $db); + return @{$parsed_url}{qw(host port user pass dbname)}; } sub get_dbi { my ($host, $port, $user, $pass, $dbname) = @_; - - my $dbconn; - if (defined $dbname) { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname); - } else { - $dbconn = sprintf("dbi:mysql:host=%s;port=%s", $host, $port); - } - my $dbi = DBI->connect( $dbconn, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr ); - + my $dbconn = defined $dbname ? sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname) : sprintf("dbi:mysql:host=%s;port=%s", $host, $port); + my $dbi = DBI->connect($dbconn, $user, $pass, { 'RaiseError' => 1 }) or croak("Can't connect to database: " . $DBI::errstr); return $dbi; } -sub get_valid_codes{ +sub get_valid_codes { my ($source_name, $species_id, $dbi) = @_; my %valid_codes; my @sources; my $big_name = uc $source_name; - my $sql = "select source_id from source where upper(name) like '%$big_name%'"; + my $sql = "SELECT source_id FROM source WHERE UPPER(name) LIKE '%$big_name%'"; my $sth = $dbi->prepare($sql); $sth->execute(); while(my @row = $sth->fetchrow_array()){ @@ -357,7 +342,7 @@ sub get_valid_codes{ $sth->finish; foreach my $source (@sources){ - $sql = "select accession, xref_id from xref where species_id = $species_id and source_id = $source"; + $sql = "SELECT accession, xref_id FROM xref WHERE species_id = $species_id AND source_id = $source"; $sth = $dbi->prepare($sql); $sth->execute(); while(my @row = $sth->fetchrow_array()){ @@ -395,7 +380,7 @@ sub add_xref { return $xref_id; } - my $add_xref_sth = $dbi->prepare('INSERT INTO xref (accession,version,label,description,source_id,species_id, info_type, info_text) VALUES(?,?,?,?,?,?,?,?)'); + my $add_xref_sth = $dbi->prepare('INSERT INTO xref (accession,version,label,description,source_id,species_id, info_type, info_text) VALUES (?,?,?,?,?,?,?,?)'); # If the description is more than 255 characters, chop it off if (defined $description && ((length $description) > 255 )) { @@ -430,7 +415,7 @@ sub add_direct_xref { return; } -sub get_direct_xref{ +sub get_direct_xref { my ($stable_id, $type, $link, $dbi) = @_; $type = lc $type; diff --git a/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py b/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py index a869c1266..c12ce0e6b 100644 --- a/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py +++ b/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py @@ -14,16 +14,17 @@ """Xref module to print out advisory datachecks results (only needed now since we are still using perl datachecks).""" -from ensembl.production.xrefs.Base import * +import re +from ensembl.production.xrefs.Base import Base class AdvisoryXrefReport(Base): def run(self): - base_path = self.param_required("base_path", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - release = self.param_required("release", {"type": 
"int"}) - datacheck_name = self.param("datacheck_name", None, {"type": "str"}) - datacheck_output = self.param("datacheck_output", None, {"type": "str"}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + datacheck_name: str = self.get_param("datacheck_name", {"type": str}) + datacheck_output: str = self.get_param("datacheck_output", {"type": str}) # Create or locate report file report_file = self.get_path( diff --git a/src/python/ensembl/production/xrefs/Alignment.py b/src/python/ensembl/production/xrefs/Alignment.py index b8ee417a1..5edac0b00 100644 --- a/src/python/ensembl/production/xrefs/Alignment.py +++ b/src/python/ensembl/production/xrefs/Alignment.py @@ -14,47 +14,64 @@ """Alignment module to map xref sequences into ensEMBL ones.""" -from ensembl.production.xrefs.Base import * +import re +import subprocess +from sqlalchemy.dialects.mysql import insert +from ensembl.xrefs.xref_update_db_model import ( + MappingJobs as MappingJobsORM, + Mapping as MappingORM, +) + +from ensembl.production.xrefs.Base import Base class Alignment(Base): + XREF_HIT_PATTERN = re.compile(r"^xref") + def run(self): - base_path = self.param_required("base_path", {"type": "str"}) - method = self.param_required("align_method", {"type": "str"}) - query_cutoff = self.param_required("query_cutoff", {"type": "int"}) - target_cutoff = self.param_required("target_cutoff", {"type": "int"}) - max_chunks = self.param_required("max_chunks", {"type": "int"}) - chunk = self.param_required("chunk", {"type": "int"}) - job_index = self.param_required("job_index", {"type": "int"}) - source = self.param_required("source_file", {"type": "str"}) - target = self.param_required("target_file", {"type": "str"}) - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - map_file = self.param_required("map_file", {"type": "str"}) - source_id = self.param_required("source_id", {"type": "int"}) - seq_type = self.param_required("seq_type", {"type": "str"}) + method: str = self.get_param("align_method", {"required": True, "type": str}) + query_cutoff: int = self.get_param("query_cutoff", {"required": True, "type": int}) + target_cutoff: int = self.get_param("target_cutoff", {"required": True, "type": int}) + max_chunks: int = self.get_param("max_chunks", {"required": True, "type": int}) + chunk: int = self.get_param("chunk", {"required": True, "type": int}) + job_index: int = self.get_param("job_index", {"required": True, "type": int}) + source: str = self.get_param("source_file", {"required": True, "type": str}) + target: str = self.get_param("target_file", {"required": True, "type": str}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + map_file: str = self.get_param("map_file", {"required": True, "type": str}) + source_id: int = self.get_param("source_id", {"required": True, "type": int}) + seq_type: str = self.get_param("seq_type", {"required": True, "type": str}) # Construct Exonerate command ryo = "xref:%qi:%ti:%ei:%ql:%tl:%qab:%qae:%tab:%tae:%C:%s\n" - exe = ( - subprocess.check_output("which exonerate", shell=True) - .decode("utf-8") - .strip() - ) - command_string = f"{exe} --showalignment FALSE --showvulgar FALSE --ryo '{ryo}' --gappedextension FALSE --model 'affine:local' {method} --subopt no --query {source} --target {target} --querychunktotal {max_chunks} --querychunkid {chunk}" + exe = 
subprocess.check_output(["which", "exonerate"]).decode("utf-8").strip() + command_string = [ + exe, + "--showalignment", "FALSE", + "--showvulgar", "FALSE", + "--ryo", f"'{ryo}'", + "--gappedextension", "FALSE", + "--model", "'affine:local'", + method, + "--subopt", "no", + "--query", source, + "--target", target, + "--querychunktotal", str(max_chunks), + "--querychunkid", str(chunk) + ] # Get exonerate hits - output = subprocess.run(command_string, shell=True, stdout=subprocess.PIPE) + output = subprocess.run(command_string, stdout=subprocess.PIPE, text=True) exit_code = abs(output.returncode) if exit_code == 0: - hits = output.stdout.decode("utf-8").split("\n") + hits = output.stdout.split("\n") # Write to mapping file - map_fh = open(map_file, "w") - for hit in hits: - if re.search(r"^xref", hit): - map_fh.write(f"{hit}\n") - map_fh.close() + with open(map_file, "w") as map_fh: + for hit in hits: + if self.XREF_HIT_PATTERN.search(hit): + map_fh.write(f"{hit}\n") elif exit_code == 9: raise MemoryError( f"Exonerate failed due to insufficient memory (exit code: {exit_code})" diff --git a/src/python/ensembl/production/xrefs/CoordinateMapping.py b/src/python/ensembl/production/xrefs/CoordinateMapping.py index d687ebee1..332e06c6e 100644 --- a/src/python/ensembl/production/xrefs/CoordinateMapping.py +++ b/src/python/ensembl/production/xrefs/CoordinateMapping.py @@ -14,26 +14,27 @@ """Xref module to process the coordinate mappings.""" -from ensembl.production.xrefs.Base import * -from ensembl.production.xrefs.mappers.CoordinateMapper import CoordinateMapper +import logging +from typing import Optional +from ensembl.production.xrefs.Base import Base +from ensembl.production.xrefs.mappers.CoordinateMapper import CoordinateMapper class CoordinateMapping(Base): def run(self): - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - scripts_dir = self.param_required("perl_scripts_dir", {"type": "str"}) - registry = self.param("registry_url", None, {"type": "str"}) - core_db_url = self.param("species_db", None, {"type": "str"}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + scripts_dir: str = self.get_param("perl_scripts_dir", {"required": True, "type": str}) + registry: Optional[str] = self.get_param("registry_url", {"type": str}) + core_db_url: Optional[str] = self.get_param("species_db", {"type": str}) logging.info(f"CoordinateMapping starting for species '{species_name}'") + # Retrieve core database URL if not provided if not core_db_url: - core_db_url = self.get_db_from_registry( - species_name, "core", release, registry - ) + core_db_url = self.get_db_from_registry(species_name, "core", release, registry) # Get species id db_engine = self.get_db_engine(core_db_url) @@ -41,9 +42,7 @@ def run(self): species_id = self.get_taxon_id(core_dbi) # Get the appropriate mapper - mapper = self.get_xref_mapper( - xref_db_url, species_name, base_path, release, core_db_url, registry - ) + mapper = self.get_xref_mapper(xref_db_url, species_name, base_path, release, core_db_url, registry) # Process the coordinate xrefs coord = 
CoordinateMapper(mapper) diff --git a/src/python/ensembl/production/xrefs/DirectXrefs.py b/src/python/ensembl/production/xrefs/DirectXrefs.py index f6522b274..121f4f897 100644 --- a/src/python/ensembl/production/xrefs/DirectXrefs.py +++ b/src/python/ensembl/production/xrefs/DirectXrefs.py @@ -14,25 +14,25 @@ """Xref module to process direct xrefs.""" -from ensembl.production.xrefs.Base import * -from ensembl.production.xrefs.mappers.DirectXrefsMapper import DirectXrefsMapper +import logging +from typing import Optional +from ensembl.production.xrefs.Base import Base +from ensembl.production.xrefs.mappers.DirectXrefsMapper import DirectXrefsMapper class DirectXrefs(Base): def run(self): - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - registry = self.param("registry_url", None, {"type": "str"}) - core_db_url = self.param("species_db", None, {"type": "str"}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + registry: Optional[str] = self.get_param("registry_url", {"type": str}) + core_db_url: Optional[str] = self.get_param("species_db", {"type": str}) logging.info(f"DirectXrefs starting for species '{species_name}'") # Get the appropriate mapper - mapper = self.get_xref_mapper( - xref_db_url, species_name, base_path, release, core_db_url, registry - ) + mapper = self.get_xref_mapper(xref_db_url, species_name, base_path, release, core_db_url, registry) # Process the direct xrefs direct_mappings = DirectXrefsMapper(mapper) diff --git a/src/python/ensembl/production/xrefs/DumpEnsembl.py b/src/python/ensembl/production/xrefs/DumpEnsembl.py index 84ce39b47..c34635f6d 100644 --- a/src/python/ensembl/production/xrefs/DumpEnsembl.py +++ b/src/python/ensembl/production/xrefs/DumpEnsembl.py @@ -14,68 +14,65 @@ """Dumping module to dump sequence data from a core db.""" -from ensembl.production.xrefs.Base import * +import os +import subprocess +import logging +from ensembl.production.xrefs.Base import Base class DumpEnsembl(Base): def run(self): - species_name = self.param_required("species_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - core_db_url = self.param_required("species_db", {"type": "str"}) - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - retry = self.param("retry", None, {"type": "bool", "default": False}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + core_db_url: str = self.get_param("species_db", {"required": True, "type": str}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + retry: bool = self.get_param("retry", {"type": bool, "default": False}) logging.info(f"DumpEnsembl starting for species '{species_name}'") - # Create files paths - cdna_path = self.get_path( - base_path, species_name, release, "ensembl", "transcripts.fa" - ) - pep_path = 
self.get_path( - base_path, species_name, release, "ensembl", "peptides.fa" - ) + # Create file paths + cdna_path = self.get_path(base_path, species_name, release, "ensembl", "transcripts.fa") + pep_path = self.get_path(base_path, species_name, release, "ensembl", "peptides.fa") # Check if dumping has been done for this run before, to speed up development by not having to re-dump sequences - if ( - not retry - and os.path.exists(cdna_path) - and os.path.getsize(cdna_path) > 0 - and os.path.exists(pep_path) - and os.path.getsize(pep_path) > 0 - ): - logging.info( - f"Dna and peptide data already dumped for species '{species_name}', skipping." - ) + if not retry and os.path.exists(cdna_path) and os.path.getsize(cdna_path) > 0 and os.path.exists(pep_path) and os.path.getsize(pep_path) > 0: + logging.info(f"Dna and peptide data already dumped for species '{species_name}', skipping.") else: - scripts_dir = self.param_required("perl_scripts_dir") + scripts_dir: str = self.get_param("perl_scripts_dir", {"required": True, "type": str}) logging.info(f"Running perl script {scripts_dir}/dump_ensembl.pl") - perl_cmd = f"perl {scripts_dir}/dump_ensembl.pl --cdna_path '{cdna_path}' --pep_path '{pep_path}' --species {species_name} --core_db_url '{core_db_url}' --release {release}" - cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE) + perl_cmd = [ + "perl", + f"{scripts_dir}/dump_ensembl.pl", + "--cdna_path", cdna_path, + "--pep_path", pep_path, + "--species", species_name, + "--core_db_url", core_db_url, + "--release", str(release) + ] + # subprocess.run(perl_cmd, check=True, stdout=subprocess.PIPE) + subprocess.run(perl_cmd, capture_output=True, text=True, check=True) # Create jobs for peptide dumping and alignment - dataflow_params = { + self.write_output("dump_xref", { "species_name": species_name, "file_path": pep_path, "xref_db_url": xref_db_url, "seq_type": "peptide", - } - self.write_output("dump_xref", dataflow_params) + }) # Create jobs for cdna dumping and alignment - dataflow_params = { + self.write_output("dump_xref", { "species_name": species_name, "file_path": cdna_path, "xref_db_url": xref_db_url, "seq_type": "dna", - } - self.write_output("dump_xref", dataflow_params) + }) # Create job for schedule mapping - dataflow_params = { + self.write_output("schedule_mapping", { "species_name": species_name, "xref_db_url": xref_db_url, "species_db": core_db_url, - } - self.write_output("schedule_mapping", dataflow_params) + }) diff --git a/src/python/ensembl/production/xrefs/DumpXref.py b/src/python/ensembl/production/xrefs/DumpXref.py index 268c8cae2..94b7c1a04 100644 --- a/src/python/ensembl/production/xrefs/DumpXref.py +++ b/src/python/ensembl/production/xrefs/DumpXref.py @@ -14,22 +14,37 @@ """Dumping module to dump xref sequence data from an xref intermediate db.""" -from ensembl.production.xrefs.Base import * - +import json +import logging +import os +import re +from sqlalchemy import select from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord +from ensembl.xrefs.xref_update_db_model import ( + Source as SourceUORM, + Xref as XrefUORM, + PrimaryXref as PrimaryXrefORM, +) + +from ensembl.production.xrefs.Base import Base class DumpXref(Base): + REFSEQ_DNA_PATTERN = re.compile(r"RefSeq_.*RNA") + REFSEQ_PEP_PATTERN = re.compile(r"RefSeq_peptide") + FILE_NAME_PATTERN = re.compile(r"\/") + SEQUENCE_PATTERN = re.compile(r"(J|O|U)") + def run(self): - species_name = self.param_required("species_name", {"type": "str"}) - base_path = 
self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - file_path = self.param_required("file_path", {"type": "str"}) - seq_type = self.param_required("seq_type", {"type": "str"}) - config_file = self.param_required("config_file", {"type": "str"}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + file_path: str = self.get_param("file_path", {"required": True, "type": str}) + seq_type: str = self.get_param("seq_type", {"required": True, "type": str}) + config_file: str = self.get_param("config_file", {"required": True, "type": str}) logging.info( f"DumpXref starting for species '{species_name}' with file_path '{file_path}' and seq_type '{seq_type}'" @@ -42,19 +57,13 @@ def run(self): full_path = self.get_path(base_path, species_name, release, "xref") # Extract sources to download from config file - sources = [] with open(config_file) as conf_file: sources = json.load(conf_file) # Create hash of available alignment methods - method = {} - query_cutoff = {} - target_cutoff = {} - for source in sources: - if source.get("method"): - method[source["name"]] = source["method"] - query_cutoff[source["name"]] = source.get("query_cutoff") - target_cutoff[source["name"]] = source.get("target_cutoff") + method = {source["name"]: source["method"] for source in sources if source.get("method")} + query_cutoff = {source["name"]: source.get("query_cutoff") for source in sources if source.get("method")} + target_cutoff = {source["name"]: source.get("target_cutoff") for source in sources if source.get("method")} job_index = 1 @@ -68,46 +77,42 @@ def run(self): source_name = source.name source_id = source.source_id - if re.search(r"RefSeq_.*RNA", source_name): + if self.REFSEQ_DNA_PATTERN.search(source_name): source_name = "RefSeq_dna" - if re.search("RefSeq_peptide", source_name): + if self.REFSEQ_PEP_PATTERN.search(source_name): source_name = "RefSeq_peptide" - if method.get(source_name): + if source_name in method: method_name = method[source_name] source_query_cutoff = query_cutoff[source_name] source_target_cutoff = target_cutoff[source_name] # Open fasta file - file_source_name = source.name - file_source_name = re.sub(r"\/", "", file_source_name) + file_source_name = self.FILE_NAME_PATTERN.sub("", source.name) filename = os.path.join( full_path, f"{seq_type}_{file_source_name}_{source_id}.fasta" ) - fasta_fh = open(filename, "w") - - # Get xref sequences - sequence_query = select( - PrimaryXrefORM.xref_id, PrimaryXrefORM.sequence - ).where( - XrefUORM.xref_id == PrimaryXrefORM.xref_id, - PrimaryXrefORM.sequence_type == seq_type, - XrefUORM.source_id == source_id, - ) - for sequence in xref_dbi.execute(sequence_query).mappings().all(): - # Ambiguous peptides must be cleaned out to protect Exonerate from J,O and U codes - seq = sequence.sequence.upper() - if seq_type == "peptide": - seq = re.sub(r"(J|O|U)", "X", seq) - - # Print sequence - SeqIO.write( - SeqRecord(Seq(seq), id=str(sequence.xref_id), description=""), - fasta_fh, - "fasta", + with open(filename, "w") as fasta_fh: + # Get xref sequences + sequence_query = select( + PrimaryXrefORM.xref_id, PrimaryXrefORM.sequence + ).where( + XrefUORM.xref_id == 
PrimaryXrefORM.xref_id, + PrimaryXrefORM.sequence_type == seq_type, + XrefUORM.source_id == source_id, ) - - fasta_fh.close() + for sequence in xref_dbi.execute(sequence_query).mappings().all(): + # Ambiguous peptides must be cleaned out to protect Exonerate from J,O and U codes + seq = sequence.sequence.upper() + if seq_type == "peptide": + seq = self.SEQUENCE_PATTERN.sub("X", seq) + + # Print sequence + SeqIO.write( + SeqRecord(Seq(seq), id=str(sequence.xref_id), description=""), + fasta_fh, + "fasta", + ) # Pass data into alignment jobs self.write_output( diff --git a/src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py b/src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py index 3513c7afc..cc015ee0b 100644 --- a/src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py +++ b/src/python/ensembl/production/xrefs/EmailAdvisoryXrefReport.py @@ -14,20 +14,22 @@ """Email module to send user emails notifying of advisory DC failures.""" -from ensembl.production.xrefs.Base import * +import os +import re from smtplib import SMTP from email.message import EmailMessage +from ensembl.production.xrefs.Base import Base class EmailAdvisoryXrefReport(Base): def run(self): - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - pipeline_name = self.param_required("pipeline_name", {"type": "str"}) - email_address = self.param_required("email", {"type": "str"}) - email_server = self.param_required("email_server", {"type": "str"}) - log_timestamp = self.param("log_timestamp", None, {"type": "str"}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + pipeline_name: str = self.get_param("pipeline_name", {"required": True, "type": str}) + email_address: str = self.get_param("email", {"required": True, "type": str}) + email_server: str = self.get_param("email_server", {"required": True, "type": str}) + log_timestamp: str = self.get_param("log_timestamp", {"type": str}) # Get the path and name of main reports file formatted_name = re.sub(r"\s", "_", pipeline_name) @@ -38,40 +40,38 @@ def run(self): else: log_path = os.path.join(base_path, "logs") if not os.path.exists(log_path): - os.makedir(log_path) + os.makedirs(log_path) main_report_file_name = f"{main_report_file_name}.log" main_report_file = os.path.join(log_path, main_report_file_name) - main_fh = open(main_report_file, "a") + with open(main_report_file, "a") as main_fh: - species_with_reports = {} + species_with_reports = {} - # Get species in base path - species_list = os.listdir(base_path) + # Get species in base path + species_list = os.listdir(base_path) - for species in species_list: - # Check if reports exist - dc_path = os.path.join(base_path, species, release, "dc_report") - if os.path.exists(dc_path): - # Get report files - dc_files = os.listdir(dc_path) + for species in species_list: + # Check if reports exist + dc_path = os.path.join(base_path, species, str(release), "dc_report") + if os.path.exists(dc_path): + # Get report files + dc_files = os.listdir(dc_path) - # Add each dc report into main report file - for dc_file in dc_files: - with open(os.path.join(dc_path, dc_file), "r") as file: - dc_data = file.read() + # Add each dc report into main report file + for dc_file in dc_files: + with open(os.path.join(dc_path, dc_file), "r") as file: + dc_data = file.read() - main_fh.write(f"{dc_data}\n") + main_fh.write(f"{dc_data}\n") - dc_name = 
dc_file.replace(".log", "") - if species_with_reports.get(dc_name): - species_with_reports[dc_name].append(species) - else: - species_with_reports[dc_name] = [species] + dc_name = dc_file.replace(".log", "") + if dc_name in species_with_reports: + species_with_reports[dc_name].append(species) + else: + species_with_reports[dc_name] = [species] - # TO DO: maybe delete individual reports - - main_fh.close() + # TO DO: maybe delete individual reports email_message = f"Some advisory datachecks have failed for the following species in the xref pipeline run ({pipeline_name}).

" for dc_name, species_list in species_with_reports.items(): @@ -96,5 +96,5 @@ def run(self): file_data, maintype="text", subtype="plain", filename=main_report_file_name ) - smtp = SMTP(email_server) - smtp.send_message(message) + with SMTP(email_server) as smtp: + smtp.send_message(message) diff --git a/src/python/ensembl/production/xrefs/Mapping.py b/src/python/ensembl/production/xrefs/Mapping.py index 838470c1a..a6f956d03 100644 --- a/src/python/ensembl/production/xrefs/Mapping.py +++ b/src/python/ensembl/production/xrefs/Mapping.py @@ -14,7 +14,9 @@ """Mapping module to map the added xrefs into the core DB.""" -from ensembl.production.xrefs.Base import * +import logging + +from ensembl.production.xrefs.Base import Base from ensembl.production.xrefs.mappers.ProcessPriorities import ProcessPriorities from ensembl.production.xrefs.mappers.ProcessPaired import ProcessPaired from ensembl.production.xrefs.mappers.ProcessMoves import ProcessMoves @@ -23,16 +25,15 @@ from ensembl.production.xrefs.mappers.XrefLoader import XrefLoader from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs - class Mapping(Base): def run(self): - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - registry = self.param("registry_url", None, {"type": "str"}) - core_db_url = self.param("species_db", None, {"type": "str"}) - verbose = self.param("verbose", None, {"default": False}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + registry: str = self.get_param("registry_url", {"type": str}) + core_db_url: str = self.get_param("species_db", {"type": str}) + verbose: bool = self.get_param("verbose", {"type": bool, "default": False}) logging.info(f"Mapping starting for species '{species_name}'") @@ -47,9 +48,7 @@ def run(self): species_id = self.get_taxon_id(core_dbi) # Get the appropriate mapper - mapper = self.get_xref_mapper( - xref_db_url, species_name, base_path, release, core_db_url, registry - ) + mapper = self.get_xref_mapper(xref_db_url, species_name, base_path, release, core_db_url, registry) # Process the xref priorities priorities = ProcessPriorities(mapper) diff --git a/src/python/ensembl/production/xrefs/ParseSource.py b/src/python/ensembl/production/xrefs/ParseSource.py index d3024fe20..eb35119b0 100644 --- a/src/python/ensembl/production/xrefs/ParseSource.py +++ b/src/python/ensembl/production/xrefs/ParseSource.py @@ -14,23 +14,27 @@ """Parsing module to call specific file/db parsers based on xref source.""" -from ensembl.production.xrefs.Base import * +import logging +import re +import importlib +from typing import Optional +from ensembl.production.xrefs.Base import Base class ParseSource(Base): - def run(self): - parser_name = self.param_required("parser", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - species_id = self.param_required("species_id", {"type": "int"}) - file_name = self.param_required("file_name", {"type": "str"}) - source_id = self.param_required("source_id", {"type": "int"}) - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - 
registry = self.param_required("registry_url", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - core_db_url = self.param_required("core_db_url", {"type": "str"}) - db = self.param("db", None, {"type": "str"}) - release_file = self.param("release_file", None, {"type": "str"}) - source_name = self.param("source_name", None, {"type": "str"}) + def run(self) -> None: + parser_name: str = self.get_param("parser", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + species_id: int = self.get_param("species_id", {"required": True, "type": int}) + file_name: str = self.get_param("file_name", {"required": True, "type": str}) + source_id: int = self.get_param("source_id", {"required": True, "type": int}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + registry_url: str = self.get_param("registry_url", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + core_db_url: str = self.get_param("core_db_url", {"required": True, "type": str}) + db: Optional[str] = self.get_param("db", {"type": str}) + release_file: Optional[str] = self.get_param("release_file", {"type": str}) + source_name: Optional[str] = self.get_param("source_name", {"type": str}) logging.info( f"ParseSource starting for source '{source_name}' with parser '{parser_name}' for species '{species_name}'" @@ -54,22 +58,22 @@ def run(self): # Get the extra db, if any if db: - dba = self.param(f"{db}_db_url") - if not dba: - dba = self.get_db_from_registry(species_name, db, release, registry) + db_url = self.get_param(f"{db}_db_url", {"type": str}) + if not db_url: + db_url = self.get_db_from_registry(species_name, db, release, registry_url) - args["dba"] = dba + args["extra_db_url"] = db_url args["ensembl_release"] = release args["core_db_url"] = core_db_url # For RefSeqCoordinate source, we run a perl script if parser_name == "RefSeqCoordinateParser": - args["perl_scripts_dir"] = self.param_required("perl_scripts_dir") + args["perl_scripts_dir"] = self.get_param("perl_scripts_dir", {"required": True, "type": str}) args["xref_db_url"] = xref_db_url # For UniProt we need the hgnc file to extract descriptions if re.search(r"^UniProt", parser_name): - args['hgnc_file'] = self.param("hgnc_file", None, {"type": "str"}) + args['hgnc_file'] = self.get_param("hgnc_file", {"type": str}) # Import the parser module_name = f"ensembl.production.xrefs.parsers.{parser_name}" @@ -77,7 +81,7 @@ def run(self): parser_class = getattr(module, parser_name) parser = parser_class() - (errors, message) = parser.run(args) + errors, message = parser.run(args) failure += errors xref_dbi.close() diff --git a/src/python/ensembl/production/xrefs/ProcessAlignment.py b/src/python/ensembl/production/xrefs/ProcessAlignment.py index 1f2295d43..289b37166 100644 --- a/src/python/ensembl/production/xrefs/ProcessAlignment.py +++ b/src/python/ensembl/production/xrefs/ProcessAlignment.py @@ -12,25 +12,24 @@ """Xref module to process the sequence-matched alignments.""" -from ensembl.production.xrefs.Base import * -from ensembl.production.xrefs.mappers.ProcessMappings import ProcessMappings +import logging +from ensembl.production.xrefs.Base import Base +from ensembl.production.xrefs.mappers.ProcessMappings import ProcessMappings class ProcessAlignment(Base): def run(self): - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - species_name = self.param_required("species_name", 
{"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - registry = self.param("registry_url", None, {"type": "str"}) - core_db_url = self.param("species_db", None, {"type": "str"}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + registry: str = self.get_param("registry_url", {"type": str}) + core_db_url: str = self.get_param("species_db", {"type": str}) logging.info(f"ProcessAlignment starting for species '{species_name}'") # Get the appropriate mapper - mapper = self.get_xref_mapper( - xref_db_url, species_name, base_path, release, core_db_url, registry - ) + mapper = self.get_xref_mapper(xref_db_url, species_name, base_path, release, core_db_url, registry) # Process the alignments mappings = ProcessMappings(mapper) diff --git a/src/python/ensembl/production/xrefs/RNACentralMapping.py b/src/python/ensembl/production/xrefs/RNACentralMapping.py index e71353f50..16646495d 100644 --- a/src/python/ensembl/production/xrefs/RNACentralMapping.py +++ b/src/python/ensembl/production/xrefs/RNACentralMapping.py @@ -14,27 +14,26 @@ """Xref module to process the RNAcentral mappings.""" -from ensembl.production.xrefs.Base import * +import logging + +from ensembl.production.xrefs.Base import Base from ensembl.production.xrefs.mappers.RNACentralMapper import RNACentralMapper from ensembl.production.xrefs.mappers.methods.MySQLChecksum import MySQLChecksum - class RNACentralMapping(Base): def run(self): - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - source_db_url = self.param_required("source_db_url", {"type": "str"}) - registry = self.param("registry_url", None, {"type": "str"}) - core_db_url = self.param("species_db", None, {"type": "str"}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) + registry: str = self.get_param("registry_url", {"type": str}) + core_db_url: str = self.get_param("species_db", {"type": str}) logging.info(f"RNACentralMapping starting for species '{species_name}'") if not core_db_url: - core_db_url = self.get_db_from_registry( - species_name, "core", release, registry - ) + core_db_url = self.get_db_from_registry(species_name, "core", release, registry) # Get species id db_engine = self.get_db_engine(core_db_url) @@ -43,15 +42,13 @@ def run(self): # Get the rna central mapper mapper = RNACentralMapper( - self.get_xref_mapper( - xref_db_url, species_name, base_path, release, core_db_url, registry - ) + self.get_xref_mapper(xref_db_url, species_name, base_path, release, core_db_url, registry) ) # Get source id db_engine = self.get_db_engine(source_db_url) with db_engine.connect() as source_dbi: - source_id = self.get_source_id_from_name(source_dbi, 
"RNACentral") + source_id = self.get_source_id_from_name("RNACentral", source_dbi) method = MySQLChecksum({"MAPPER": mapper}) results = method.run( diff --git a/src/python/ensembl/production/xrefs/ScheduleAlignment.py b/src/python/ensembl/production/xrefs/ScheduleAlignment.py index d1fca7697..36787edc4 100644 --- a/src/python/ensembl/production/xrefs/ScheduleAlignment.py +++ b/src/python/ensembl/production/xrefs/ScheduleAlignment.py @@ -14,24 +14,28 @@ """Scheduling module to create xref/ensEMBL alignment jobs.""" -from ensembl.production.xrefs.Base import * +import logging +import os +from typing import Optional +from ensembl.production.xrefs.Base import Base class ScheduleAlignment(Base): def run(self): - species_name = self.param_required("species_name", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - target_file = self.param_required("ensembl_fasta", {"type": "str"}) - source_file = self.param_required("xref_fasta", {"type": "str"}) - seq_type = self.param_required("seq_type", {"type": "str"}) - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - method = self.param_required("method", {"type": "str"}) - query_cutoff = self.param_required("query_cutoff", {"type": "int"}) - target_cutoff = self.param_required("target_cutoff", {"type": "int"}) - source_id = self.param_required("source_id", {"type": "int"}) - source_name = self.param_required("source_name", {"type": "str"}) - job_index = self.param_required("job_index", {"type": "int"}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + target_file: str = self.get_param("ensembl_fasta", {"required": True, "type": str}) + source_file: str = self.get_param("xref_fasta", {"required": True, "type": str}) + seq_type: str = self.get_param("seq_type", {"required": True, "type": str}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + method: str = self.get_param("method", {"required": True, "type": str}) + query_cutoff: int = self.get_param("query_cutoff", {"required": True, "type": int}) + target_cutoff: int = self.get_param("target_cutoff", {"required": True, "type": int}) + source_id: int = self.get_param("source_id", {"required": True, "type": int}) + source_name: str = self.get_param("source_name", {"required": True, "type": str}) + job_index: int = self.get_param("job_index", {"required": True, "type": int}) + chunk_size: Optional[int] = self.get_param("chunk_size", {"type": int, "default": 1000000}) logging.info( f"ScheduleAlignment starting for species '{species_name}' with seq_type '{seq_type}' and job_index '{job_index}'" @@ -39,14 +43,14 @@ def run(self): # Inspect file size to decide on chunking size = os.stat(target_file).st_size - chunks = int(size / 1000000) + 1 + chunks = int(size / chunk_size) + 1 # Create output path output_path = self.get_path(base_path, species_name, release, "alignment") # Pass alignment data for each chunk chunklet = 1 - while chunklet <= chunks: + for chunklet in range(1, chunks + 1): output_path_chunk = os.path.join( output_path, f"{seq_type}_alignment_{source_id}_{chunklet}_of_{chunks}.map", @@ -70,4 +74,3 @@ def run(self): "seq_type": seq_type, }, ) - chunklet += 1 diff --git a/src/python/ensembl/production/xrefs/ScheduleMapping.py 
b/src/python/ensembl/production/xrefs/ScheduleMapping.py index 44032ad76..94085ce11 100644 --- a/src/python/ensembl/production/xrefs/ScheduleMapping.py +++ b/src/python/ensembl/production/xrefs/ScheduleMapping.py @@ -14,25 +14,24 @@ """Scheduling module to create xref mapping jobs.""" -from ensembl.production.xrefs.Base import * -from ensembl.production.xrefs.mappers.CoreInfo import CoreInfo +import logging +from ensembl.production.xrefs.Base import Base +from ensembl.production.xrefs.mappers.CoreInfo import CoreInfo class ScheduleMapping(Base): def run(self): - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - registry = self.param("registry_url", None, {"type": "str"}) - core_db_url = self.param("species_db", None, {"type": "str"}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + registry: str = self.get_param("registry_url", {"type": str}) + core_db_url: str = self.get_param("species_db", {"type": str}) logging.info(f"ScheduleMapping starting for species '{species_name}'") # Get the appropriate mapper - mapper = self.get_xref_mapper( - xref_db_url, species_name, base_path, release, core_db_url, registry - ) + mapper = self.get_xref_mapper(xref_db_url, species_name, base_path, release, core_db_url, registry) # Load the core data logging.info("Loading core data") diff --git a/src/python/ensembl/production/xrefs/ScheduleParse.py b/src/python/ensembl/production/xrefs/ScheduleParse.py index cf044e1ee..149eb1c71 100644 --- a/src/python/ensembl/production/xrefs/ScheduleParse.py +++ b/src/python/ensembl/production/xrefs/ScheduleParse.py @@ -15,20 +15,30 @@ """Scheduling module to create parsing jobs for each xref source.""" import glob +import logging +import os +import re +from sqlalchemy import select +from sqlalchemy.engine.url import make_url +from typing import Tuple, Optional -from ensembl.production.xrefs.Base import * +from ensembl.xrefs.xref_source_db_model import ( + Source as SourceSORM, + Version as VersionORM, +) +from ensembl.production.xrefs.Base import Base class ScheduleParse(Base): def run(self): - species_name = self.param_required("species_name", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - registry = self.param_required("registry_url", {"type": "str"}) - order_priority = self.param_required("priority", {"type": "int"}) - source_db_url = self.param_required("source_db_url", {"type": "str"}) - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - get_species_file = self.param_required("get_species_file", {"type": "bool"}) - core_db_url = self.param("species_db", None, {"type": "str"}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + registry: str = self.get_param("registry_url", {"required": True, "type": str}) + order_priority: int = self.get_param("priority", {"required": True, "type": int}) + source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) + xref_db_url: str = self.get_param("xref_db_url", 
{"required": True, "type": str}) + get_species_file: bool = self.get_param("get_species_file", {"required": True, "type": bool}) + core_db_url: Optional[str] = self.get_param("species_db", {"type": str}) logging.info(f"ScheduleParse starting for species '{species_name}'") logging.info(f"\tParam: order_priority = {order_priority}") @@ -40,14 +50,30 @@ def run(self): # Create Xref database only at priority 1 (one time) if order_priority == 1: - sources_config_file = self.param_required("sources_config_file") + sources_config_file: str = self.get_param("sources_config_file", {"required": True, "type": str}) logging.info(f"\tParam: sources_config_file = {sources_config_file}") + # Construct xref update db name, truncating if necessary + max_length = 64 + original_species_name = species_name + xref_db_name = f"{species_name}_xref_update_{release}" + if len(xref_db_name) > max_length: + # Try to shorten the name by replacing "_collection" with "_col" + if species_name.endswith("_collection"): + species_name = species_name.replace("_collection", "_col") + xref_db_name = f"{species_name}_xref_update_{release}" + + # If still too long, truncate the _xref_update_ part + if len(xref_db_name) > max_length: + xref_db_name = f"{species_name}_xup_{release}" + + # If still too long, raise an error + if len(xref_db_name) > max_length: + raise ValueError(f"Could not sufficiently reduce DB name for species {species_name}") + species_name = original_species_name + # Construct xref update url - xref_db_url = make_url(xref_db_url) - xref_db_url = xref_db_url.set( - database=f"{species_name}_xref_update_{release}" - ) + xref_db_url = make_url(xref_db_url).set(database=xref_db_name) self.create_xref_db(xref_db_url, sources_config_file) xref_db_url = xref_db_url.render_as_string(hide_password=False) @@ -68,13 +94,10 @@ def run(self): species_name, "core", release, registry ) if not re.search(r"^mysql://", core_db_url): - core_db_url = "mysql://" + core_db_url + core_db_url = f"mysql://{core_db_url}" # Get species and division ids - db_engine = self.get_db_engine(core_db_url) - with db_engine.connect() as core_dbi: - species_id = self.get_taxon_id(core_dbi) - division_id = self.get_division_id(core_dbi) + species_id, division_id = self.get_core_db_info(core_db_url) # Retrieve list of sources from source database db_engine = self.get_db_engine(source_db_url) @@ -104,9 +127,7 @@ def run(self): if source.name == "HGNC": hgnc_path = source.file_path - if source.db == "checksum": - continue - if source.priority != order_priority: + if source.db == "checksum" or source.priority != order_priority: continue dataflow_params = { @@ -117,9 +138,7 @@ def run(self): } # Use clean files if available - file_name = source.file_path - if source.clean_path: - file_name = source.clean_path + file_name = source.clean_path or source.file_path # Some sources are species-specific source_id = self.get_source_id( @@ -128,11 +147,12 @@ def run(self): if not source_id: continue - dataflow_params["source_id"] = source_id - dataflow_params["source_name"] = source.name - dataflow_params["parser"] = source.parser - if source.revision: - dataflow_params["release_file"] = source.revision + dataflow_params.update({ + "source_id": source_id, + "source_name": source.name, + "parser": source.parser, + "release_file": source.revision if source.revision else None, + }) # Some sources need a connection to a special database if source.db: @@ -163,30 +183,24 @@ def run(self): total_sources += 1 else: # Get list of files if directory - if 
os.path.isdir(file_name): - list_files = os.listdir(file_name) - list_files = [os.path.join(file_name, f) for f in list_files] - else: - list_files = [file_name] + list_files = ( + [os.path.join(file_name, f) for f in os.listdir(file_name)] + if os.path.isdir(file_name) + else [file_name] + ) # For Uniprot and Refseq, files might have been split by species if get_species_file: - match source.name: - case "Uniprot/SWISSPROT": - file_prefix = "uniprot_sprot" - case "Uniprot/SPTREMBL": - file_prefix = "uniprot_trembl" - case "RefSeq_dna": - file_prefix = "refseq_rna" - case "RefSeq_peptide": - file_prefix = "refseq_protein" - case _: - file_prefix = None + file_prefix = { + "Uniprot/SWISSPROT": "uniprot_sprot", + "Uniprot/SPTREMBL": "uniprot_trembl", + "RefSeq_dna": "refseq_rna", + "RefSeq_peptide": "refseq_protein", + }.get(source.name) if file_prefix: list_files = glob.glob( - file_name + "/**/" + file_prefix + "-" + str(species_id), - recursive=True, + f"{file_name}/**/{file_prefix}-{species_id}", recursive=True ) if source.name == "ZFIN_ID": @@ -198,7 +212,8 @@ def run(self): dataflow_params["file_name"] = file - if re.search(r"^Uniprot", source.name): + if re.search(r"^Uniprot", source.name) and hgnc_path: + hgnc_files = glob.glob(hgnc_path + "/*") dataflow_params["hgnc_file"] = hgnc_files[0] @@ -208,8 +223,7 @@ def run(self): xref_dbi.close() if total_sources == 0: - with open(f"dataflow_{dataflow_suffix}.json", "a") as fh: - fh.write("") + self.write_output(dataflow_suffix, {}) dataflow_params = { "species_name": species_name, @@ -217,3 +231,11 @@ def run(self): "xref_db_url": xref_db_url, } self.write_output(dataflow_sub_suffix, dataflow_params) + + def get_core_db_info(self, core_db_url: str) -> Tuple[int, int]: + db_engine = self.get_db_engine(core_db_url) + with db_engine.connect() as core_dbi: + species_id = self.get_taxon_id(core_dbi) + division_id = self.get_division_id(core_dbi) + + return species_id, division_id diff --git a/src/python/ensembl/production/xrefs/ScheduleSpecies.py b/src/python/ensembl/production/xrefs/ScheduleSpecies.py index e63de241a..9537fa452 100644 --- a/src/python/ensembl/production/xrefs/ScheduleSpecies.py +++ b/src/python/ensembl/production/xrefs/ScheduleSpecies.py @@ -14,20 +14,24 @@ """Scheduling module to create a pipeline branch for each species in list or division.""" -from ensembl.production.xrefs.Base import * +import logging +import re +import requests +from typing import List, Dict, Optional +from ensembl.production.xrefs.Base import Base class ScheduleSpecies(Base): def run(self): - run_all = self.param_required("run_all", {"type": "bool"}) - registry = self.param_required("registry_url", {"type": "str"}) - ensembl_release = self.param_required("release", {"type": "int"}) - metasearch_url = self.param_required("metasearch_url", {"type": "str"}) - species = self.param("species", None, {"default": "", "type": "str"}) - antispecies = self.param("antispecies", None, {"default": "", "type": "str"}) - division = self.param("division", None, {"default": "", "type": "str"}) - db_prefix = self.param("db_prefix", None, {"type": "str"}) - group = self.param("group", None, {"default": "core", "type": "str"}) + run_all: bool = self.get_param("run_all", {"required": True, "type": bool}) + registry: str = self.get_param("registry_url", {"required": True, "type": str}) + ensembl_release: int = self.get_param("release", {"required": True, "type": int}) + metasearch_url: str = self.get_param("metasearch_url", {"required": True, "type": str}) + species: list = 
self.get_param("species", {"default": [], "type": list}) + antispecies: list = self.get_param("antispecies", {"default": [], "type": list}) + division: list = self.get_param("division", {"default": [], "type": list}) + group: str = self.get_param("group", {"default": "core", "type": str}) + db_prefix: Optional[str] = self.get_param("db_prefix", {"type": str}) logging.info("ScheduleSpecies starting with parameters:") logging.info(f"\tParam: run_all = {run_all}") @@ -40,102 +44,49 @@ def run(self): logging.info(f"\tParam: db_prefix = {db_prefix}") logging.info(f"\tParam: group = {group}") - if species: - species = species.split(",") - if antispecies: - antispecies = antispecies.split(",") - if division: - division = division.split(",") - ensembl_release = str(ensembl_release) - # Fix registry url, if needed - match = re.search(r"^(.*)://(.*)", registry) - if match: - registry = match.group(2) - match = re.search(r"(.*)/(.*)", registry) - if match: - registry = match.group(1) + registry = self._fix_registry_url(registry) loaded_dbs = {} dbs = [] # Construct the db name pattern - name_pattern = f"%_{group}%" - if db_prefix: - db_prefix = f"{db_prefix}_" - else: - db_prefix = "" - name_pattern = f"{db_prefix}{name_pattern}" + db_prefix = f"{db_prefix}_" if db_prefix else "" + name_pattern = f"{db_prefix}%_{group}%" # Getting all dbs if run_all: - metasearch_body = { - "name_pattern": name_pattern, - "filters": [ - {"meta_key": "schema_version", "meta_value": ensembl_release}, - ], - "servers": [registry], - } - - # Query registry for all core dbs - dbs = requests.post(metasearch_url, json=metasearch_body).json() - dbs = dbs[registry] - + dbs = self._query_registry(metasearch_url, name_pattern, ensembl_release, registry) loaded_dbs = self.check_validity(dbs, db_prefix, group, ensembl_release) # Getting dbs for specified species - elif species and len(species) > 0: + elif species: for species_name in species: - name_pattern = f"{species_name}_core%" - name_pattern = f"{db_prefix}{name_pattern}" - - metasearch_body = { - "name_pattern": name_pattern, - "filters": [ - {"meta_key": "schema_version", "meta_value": ensembl_release}, - ], - "servers": [registry], - } - - # Query registry for species dbs - species_dbs = requests.post(metasearch_url, json=metasearch_body).json() - - if len(species_dbs[registry]) < 1: - raise IOError( - f"Database not found for {species_name}, check registry parameters" - ) - else: - dbs = dbs + species_dbs[registry] + species_pattern = f"{db_prefix}{species_name}_core%" + species_dbs = self._query_registry(metasearch_url, species_pattern, ensembl_release, registry) + if not species_dbs: + raise LookupError(f"Database not found for {species_name}, check registry parameters") + dbs.extend(species_dbs) loaded_dbs = self.check_validity(dbs, db_prefix, group, ensembl_release) # Check if all wanted species were found - for species_name in species: - if not loaded_dbs.get(species_name): - raise IOError( - f"Database not found for {species_name}, check registry parameters" - ) + self._check_species_found(species, loaded_dbs) # Getting dbs for specified divisions - elif division and len(division) > 0: + elif division: for div in division: - metasearch_body = { - "name_pattern": name_pattern, - "filters": [ - {"meta_key": "schema_version", "meta_value": ensembl_release}, - {"meta_key": "species.division", "meta_value": div}, - ], - "servers": [registry], - } - - # Query registry for dbs in division - div_dbs = requests.post(metasearch_url, json=metasearch_body).json() - dbs = dbs 
+ div_dbs[registry] + div_dbs = self._query_registry(metasearch_url, name_pattern, ensembl_release, registry, div) + dbs.extend(div_dbs) loaded_dbs = self.check_validity(dbs, db_prefix, group, ensembl_release) - if len(loaded_dbs) == 0: - raise IOError(f"Could not find any matching dbs in registry {registry}") + # No species or division specified with run_all set to False + else: + raise ValueError("Must provide species or division with run_all set to False") + + if not loaded_dbs: + raise LookupError(f"Could not find any matching dbs in registry {registry}") if run_all: logging.info(f"All species in {len(loaded_dbs)} databases loaded") @@ -143,35 +94,59 @@ def run(self): # Write dataflow output for species_name, db in loaded_dbs.items(): if species_name not in antispecies: - self.write_output( - "species", {"species_name": species_name, "species_db": db} - ) + self.write_output("species", {"species_name": species_name, "species_db": db}) - def check_validity(self, dbs: List(str), prefix: str, group: str, release: str): + def _fix_registry_url(self, registry: str) -> str: + match = re.search(r"^(.*)://(.*)", registry) + if match: + registry = match.group(2) + match = re.search(r"(.*)/(.*)$", registry) + if match: + registry = match.group(1) + return registry + + def _query_registry(self, metasearch_url: str, name_pattern: str, ensembl_release: int, registry: str, division: str = None) -> List[str]: + ensembl_release_str = str(ensembl_release) + + filters = [ + { + "meta_key": "schema_version", + "meta_value": ensembl_release_str + } + ] + + if division: + filters.append({"meta_key": "species.division", "meta_value": division}) + + metasearch_body = { + "name_pattern": name_pattern, + "filters": filters, + "servers": [registry], + } + response = requests.post(metasearch_url, json=metasearch_body).json() + return response.get(registry, []) + + def _check_species_found(self, species_list: List[str], loaded_dbs: Dict[str, str]): + for species_name in species_list: + if species_name not in loaded_dbs: + raise LookupError(f"Database not found for {species_name}, check registry parameters") + + def check_validity(self, dbs: List[str], prefix: str, group: str, release: int) -> Dict[str, str]: valid_dbs = {} for db in dbs: # Extract db name - db_name = db - match = re.search(r"(.*)/(.*)", db_name) - if match: - db_name = match.group(2) + db_name = re.search(r"(.*)/(.*)$", db).group(2) if re.search(r"(.*)/(.*)$", db) else db # Check if db is valid - match = re.search( - r"^(%s)([a-z]+_[a-z0-9]+(?:_[a-z0-9]+)?)_%s(?:_\d+)?_%s_(\w+)$" - % (prefix, group, release), - db_name, - ) + match = re.search(rf"^{prefix}([a-z]+_[a-z0-9]+(?:_[a-z0-9]+)?)_{group}(?:_\d+)?_{release}_(\w+)$", db_name) if match: - species_name = match.group(2) - if not valid_dbs.get(species_name): + species_name = match.group(1) + if species_name not in valid_dbs: logging.info(f"Species {species_name} loaded") valid_dbs[species_name] = db else: - raise IOError( - f"Database {valid_dbs[species_name]} already loaded for species {species_name}, cannot load second database {db}" - ) + raise ValueError(f"Database {valid_dbs[species_name]} already loaded for species {species_name}, cannot load second database {db}") else: logging.info(f"Could not extract species name from database {db}") diff --git a/src/python/ensembl/production/xrefs/UniParcMapping.py b/src/python/ensembl/production/xrefs/UniParcMapping.py index 86668b621..07beaa923 100644 --- a/src/python/ensembl/production/xrefs/UniParcMapping.py +++ 
b/src/python/ensembl/production/xrefs/UniParcMapping.py @@ -14,27 +14,26 @@ """Xref module to process the Uniparc mappings.""" -from ensembl.production.xrefs.Base import * +import logging + +from ensembl.production.xrefs.Base import Base from ensembl.production.xrefs.mappers.UniParcMapper import UniParcMapper from ensembl.production.xrefs.mappers.methods.MySQLChecksum import MySQLChecksum - class UniParcMapping(Base): def run(self): - xref_db_url = self.param_required("xref_db_url", {"type": "str"}) - species_name = self.param_required("species_name", {"type": "str"}) - base_path = self.param_required("base_path", {"type": "str"}) - release = self.param_required("release", {"type": "int"}) - source_db_url = self.param_required("source_db_url", {"type": "str"}) - registry = self.param("registry_url", None, {"type": "str"}) - core_db_url = self.param("species_db", None, {"type": "str"}) + xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) + species_name: str = self.get_param("species_name", {"required": True, "type": str}) + base_path: str = self.get_param("base_path", {"required": True, "type": str}) + release: int = self.get_param("release", {"required": True, "type": int}) + source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) + registry: str = self.get_param("registry_url", {"type": str}) + core_db_url: str = self.get_param("species_db", {"type": str}) logging.info(f"UniParcMapping starting for species '{species_name}'") if not core_db_url: - core_db_url = self.get_db_from_registry( - species_name, "core", release, registry - ) + core_db_url = self.get_db_from_registry(species_name, "core", release, registry) # Get species id db_engine = self.get_db_engine(core_db_url) @@ -43,15 +42,13 @@ def run(self): # Get the uniparc mapper mapper = UniParcMapper( - self.get_xref_mapper( - xref_db_url, species_name, base_path, release, core_db_url, registry - ) + self.get_xref_mapper(xref_db_url, species_name, base_path, release, core_db_url, registry) ) # Get source id db_engine = self.get_db_engine(source_db_url) with db_engine.connect() as source_dbi: - source_id = self.get_source_id_from_name(source_dbi, "UniParc") + source_id = self.get_source_id_from_name("UniParc", source_dbi) method = MySQLChecksum({"MAPPER": mapper}) results = method.run( diff --git a/src/python/ensembl/production/xrefs/mappers/BasicMapper.py b/src/python/ensembl/production/xrefs/mappers/BasicMapper.py index 362eea354..bd563505d 100644 --- a/src/python/ensembl/production/xrefs/mappers/BasicMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/BasicMapper.py @@ -14,74 +14,27 @@ """Base module to handle xref mapping.""" -import re -import os -import sys -import warnings import logging -import subprocess -from sqlalchemy import select, insert, update, func, delete, desc, text +from sqlalchemy import select, insert, update, delete from sqlalchemy.engine import Engine, Connection -from sqlalchemy.orm import Session, sessionmaker, aliased -from sqlalchemy.sql.expression import case -from sqlalchemy.sql import Select -from typing import Dict, Any, List, Optional, Tuple - -from ensembl.core.models import ( - Gene as GeneORM, - Transcript as TranscriptORM, - Translation as TranslationORM, - Meta as MetaCORM, - AltAllele as AltAlleleCORM, - t_alt_allele_attrib as AltAlleleAttribORM, - ObjectXref as ObjectXrefCORM, - Xref as XrefCORM, - ExternalDb as ExternalDbORM, - UnmappedObject as UnmappedObjectORM, - UnmappedReason as UnmappedReasonORM, - Analysis as 
AnalysisORM, - OntologyXref as OntologyXrefORM, - ExternalSynonym as ExternalSynonymORM, - DependentXref as DependentXrefCORM, - IdentityXref as IdentityXrefCORM, - SeqRegionAttrib as SeqRegionAttribORM, - AttribType as AttribTypeORM, -) +from typing import Dict, Any, Optional +from datetime import datetime from ensembl.xrefs.xref_update_db_model import ( GeneTranscriptTranslation as GeneTranscriptTranslationORM, - GeneStableId as GeneStableIdORM, - TranscriptStableId as TranscriptStableIdORM, - TranslationStableId as TranslationStableIdORM, Meta as MetaUORM, ProcessStatus as ProcessStatusORM, ObjectXref as ObjectXrefUORM, - AltAllele as AltAlleleUORM, Source as SourceUORM, Xref as XrefUORM, IdentityXref as IdentityXrefUORM, - DependentXref as DependentXrefUORM, - GeneDirectXref as GeneDirectXrefORM, - TranscriptDirectXref as TranscriptDirectXrefORM, - TranslationDirectXref as TranslationDirectXrefORM, - Mapping as MappingORM, - MappingJobs as MappingJobsORM, - CoordinateXref as CoordinateXrefORM, - Synonym as SynonymORM, - Pairs as PairsORM, - PrimaryXref as PrimaryXrefORM, - DisplayXrefPriority as DisplayXrefPriorityORM, - GeneDescPriority as GeneDescPriorityORM, + DependentXref as DependentXrefUORM ) -from datetime import datetime - - class BasicMapper: - def __init__(self, args: Dict[str, Any] = None) -> None: - if args is None: - args = {} + def __init__(self, args: Optional[Dict[str, Any]] = None) -> None: + args = args or {} self._xref = args.get("xref") self._core = args.get("core") @@ -90,7 +43,7 @@ def __init__(self, args: Dict[str, Any] = None) -> None: self._log_file = args.get("log_file") self._species_dir = args.get("species_dir") - def xref(self, xref_db_engine: Engine = None) -> Engine: + def xref(self, xref_db_engine: Optional[Engine] = None) -> Engine: """Getter/Setter for the xref DB engine. Parameters @@ -100,14 +53,15 @@ def xref(self, xref_db_engine: Engine = None) -> Engine: Returns ------- - The xref DB engine. + Engine + The xref DB engine. """ if xref_db_engine: self._xref = xref_db_engine return self._xref - def core(self, core_db_engine: Engine = None) -> Engine: + def core(self, core_db_engine: Optional[Engine] = None) -> Engine: """Getter/Setter for the core DB engine. Parameters @@ -117,14 +71,15 @@ def core(self, core_db_engine: Engine = None) -> Engine: Returns ------- - The core DB engine. + Engine + The core DB engine. """ if core_db_engine: self._core = core_db_engine return self._core - def dna_file(self, dna_file: str = None) -> str: + def dna_file(self, dna_file: Optional[str] = None) -> str: """Getter/Setter for the dna file. Parameters @@ -134,14 +89,15 @@ def dna_file(self, dna_file: str = None) -> str: Returns ------- - The dna file path + Optional[str] + The dna file path """ - if dna_file: + if dna_file is not None: self._dna_file = dna_file return self._dna_file - def protein_file(self, protein_file: str = None) -> str: + def protein_file(self, protein_file: Optional[str] = None) -> str: """Getter/Setter for the protein file. Parameters @@ -151,14 +107,15 @@ def protein_file(self, protein_file: str = None) -> str: Returns ------- - The protein file path + Optional[str] + The protein file path """ - if protein_file: + if protein_file is not None: self._protein_file = protein_file return self._protein_file - def log_file(self, log_file: str = None) -> str: + def log_file(self, log_file: Optional[str] = None) -> str: """Getter/Setter for the log file. 
Parameters @@ -168,14 +125,15 @@ def log_file(self, log_file: str = None) -> str: Returns ------- - The log file path + Optional[str] + The log file path """ - if log_file: + if log_file is not None: self._log_file = log_file return self._log_file - def species_dir(self, species_dir: str = None) -> str: + def species_dir(self, species_dir: Optional[str] = None) -> str: """Getter/Setter for the species directory. Parameters @@ -185,14 +143,16 @@ def species_dir(self, species_dir: str = None) -> str: Returns ------- - The species directory + Optional[str] + The species directory """ - if species_dir: + if species_dir is not None: self._species_dir = species_dir return self._species_dir def official_name(self) -> None: + """Returns the official name.""" return None def add_meta_pair(self, meta_key: str, meta_value: str) -> None: @@ -205,7 +165,7 @@ def add_meta_pair(self, meta_key: str, meta_value: str) -> None: meta_value: str The value of the 'meta_value' column in the meta table """ - now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + now = datetime.now() with self.xref().connect() as dbi: dbi.execute( @@ -214,13 +174,18 @@ def add_meta_pair(self, meta_key: str, meta_value: str) -> None: ) ) - def get_meta_value(self, meta_key: str) -> str: + def get_meta_value(self, meta_key: str) -> Optional[str]: """Gets a value from the meta table based on key. Parameters ---------- meta_key: str The value of the 'meta_key' column in the meta table + + Returns + ------- + Optional[str] + The value of the 'meta_value' column if found, else None """ with self.xref().connect() as dbi: query = ( @@ -228,11 +193,9 @@ def get_meta_value(self, meta_key: str) -> str: .where(MetaUORM.meta_key == meta_key) .order_by(MetaUORM.meta_id.desc()) ) - value = dbi.execute(query).first() + result = dbi.execute(query).first() - if value: - value = value[0] - return value + return result[0] if result else None def update_process_status(self, status: str) -> None: """Adds a row to the process_status table. @@ -240,19 +203,23 @@ def update_process_status(self, status: str) -> None: Parameters ---------- status: str - The value of the 'status' column on the process_status table + The value of the 'status' column in the process_status table """ - now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + now = datetime.now() with self.xref().connect() as dbi: - dbi.execute(insert(ProcessStatusORM).values(status=status, date=now)) + dbi.execute( + insert(ProcessStatusORM).values(status=status, date=now) + ) def set_up_logging(self) -> None: + """Sets up logging for the mapper.""" log_file = self.log_file() + # Create handlers and set levels console_handler = logging.StreamHandler() - file_handler = logging.FileHandler(log_file, mode="a") console_handler.setLevel(logging.WARNING) + file_handler = logging.FileHandler(log_file, mode="a") file_handler.setLevel(logging.DEBUG) logging.basicConfig( @@ -263,9 +230,16 @@ def set_up_logging(self) -> None: ) def log_progress(self, message: str) -> None: + """Logs a message to the console and log file. 
+ + Parameters + ---------- + message: str + The message to log + """ logging.info(message) - def get_object_xref_id(self, ensembl_id: int, xref_id: int, ensembl_type: str, linkage_type: str, dbi: Connection, master_xref_id: int = None, status: str = None) -> int: + def get_object_xref_id(self, ensembl_id: int, xref_id: int, ensembl_type: str, linkage_type: str, dbi: Connection, master_xref_id: Optional[int] = None, status: Optional[str] = None) -> Optional[int]: """Retrieves the object_xref row ID from ensembl ID, xref ID, ensembl type, and linkage type. Parameters @@ -282,15 +256,14 @@ def get_object_xref_id(self, ensembl_id: int, xref_id: int, ensembl_type: str, l The xref ID of the xref that this object xref is dependent on status: str, optional The object xref status - dbi: sqlalchemy.engine.Connection + dbi: Connection The database connection to query in Returns ------- - The object xref ID, if found (else None). + Optional[int] + The object xref ID, if found (else None). """ - object_xref_id = None - query = select(ObjectXrefUORM.object_xref_id).where( ObjectXrefUORM.ensembl_id == ensembl_id, ObjectXrefUORM.xref_id == xref_id, @@ -302,14 +275,11 @@ def get_object_xref_id(self, ensembl_id: int, xref_id: int, ensembl_type: str, l if status is not None: query = query.where(ObjectXrefUORM.ox_status == status) - result = dbi.execute(query).fetchall() - - if result: - object_xref_id = result[0][0] + result = dbi.execute(query).scalar() - return object_xref_id + return result - def add_object_xref(self, ensembl_id: int, xref_id: int, ensembl_type: str, linkage_type: str, dbi: Connection, master_xref_id: int = None, status: str = None) -> int: + def add_object_xref(self, ensembl_id: int, xref_id: int, ensembl_type: str, linkage_type: str, dbi: Connection, master_xref_id: Optional[int] = None, status: Optional[str] = None) -> int: """Adds data into object xref table in a database. Parameters @@ -326,21 +296,21 @@ def add_object_xref(self, ensembl_id: int, xref_id: int, ensembl_type: str, link The xref ID of the xref that this object xref is dependent on status: str, optional The object xref status - dbi: sqlalchemy.engine.Connection + dbi: Connection The database connection to query in Returns ------- - The inserted object xref ID. + int + The inserted object xref ID. """ query = insert(ObjectXrefUORM).values( ensembl_id=ensembl_id, xref_id=xref_id, ensembl_object_type=ensembl_type, linkage_type=linkage_type, + master_xref_id=master_xref_id ) - if master_xref_id is not None: - query = query.values(master_xref_id=master_xref_id) if status is not None: query = query.values(ox_status=status) dbi.execute(query) @@ -351,30 +321,38 @@ def add_object_xref(self, ensembl_id: int, xref_id: int, ensembl_type: str, link return object_xref_id def biomart_fix(self, db_name: str, type1: str, type2: str, dbi: Connection) -> None: + """Fixes the biomart issue where a database is associated with both gene and transcript/translation object types. + + Parameters + ---------- + db_name: str + The database name + type1: str + The first object type (gene, transcript, or translation) + type2: str + The second object type (gene, transcript, or translation) + dbi: Connection + The database connection to query in + """ logging.info( f"{db_name} is associated with both {type1} and {type2} object types. Fixing." 
) - # Figure out where to move xref to - to_type, from_type, to_id, from_id = None, None, None, None - if type1 == "Gene" or type2 == "Gene": + # Determine the types to move from and to + if "Gene" in (type1, type2): to_type = "Gene" - - if type1 == "Translation" or type2 == "Translation": - from_type = "Translation" - else: - from_type = "Transcript" + from_type = "Translation" if "Translation" in (type1, type2) else "Transcript" else: to_type = "Transcript" from_type = "Translation" logging.info(f"Moving all associations from {from_type} to {to_type}") - to_id = getattr(GeneTranscriptTranslationORM, to_type.lower() + "_id") - from_id = getattr(GeneTranscriptTranslationORM, from_type.lower() + "_id") + to_id = getattr(GeneTranscriptTranslationORM, f"{to_type.lower()}_id") + from_id = getattr(GeneTranscriptTranslationORM, f"{from_type.lower()}_id") # Move the object xref - query = ( + move_query = ( update(ObjectXrefUORM) .values(ensembl_object_type=to_type, ensembl_id=to_id) .where( @@ -387,10 +365,10 @@ def biomart_fix(self, db_name: str, type1: str, type2: str, dbi: Connection) -> ) .prefix_with("IGNORE") ) - dbi.execute(query) + dbi.execute(move_query) # Delete moved object xref - query = ( + delete_query = ( select(ObjectXrefUORM.object_xref_id) .outerjoin( IdentityXrefUORM, @@ -404,7 +382,7 @@ def biomart_fix(self, db_name: str, type1: str, type2: str, dbi: Connection) -> SourceUORM.name == db_name, ) ) - for row in dbi.execute(query).mappings().all(): + for row in dbi.execute(delete_query).mappings().all(): dbi.execute( delete(ObjectXrefUORM).where( ObjectXrefUORM.object_xref_id == row.object_xref_id @@ -418,12 +396,23 @@ def biomart_fix(self, db_name: str, type1: str, type2: str, dbi: Connection) -> # Delete dependent xref sub_query = select(ObjectXrefUORM.object_xref_id) - query = delete(DependentXrefUORM).where( + dependent_delete_query = delete(DependentXrefUORM).where( DependentXrefUORM.object_xref_id.not_in(sub_query) ) - dbi.execute(query) + dbi.execute(dependent_delete_query) def update_object_xref_status(self, object_xref_id: int, status: str, dbi: Connection) -> None: + """Updates the status of an object xref. 
+ + Parameters + ---------- + object_xref_id: int + The object xref ID to update + status: str + The new status + dbi: Connection + The database connection to query in + """ query = ( update(ObjectXrefUORM) .where(ObjectXrefUORM.object_xref_id == object_xref_id) diff --git a/src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py b/src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py index 535bb7ad6..08c194dc7 100644 --- a/src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/ChecksumMapper.py @@ -14,8 +14,19 @@ """Mapper module for processing Checksum xref data.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +import logging +from typing import Any, Dict, List, Optional +from sqlalchemy.orm import sessionmaker +from sqlalchemy import delete, select +from sqlalchemy.engine import Connection +from ensembl.xrefs.xref_update_db_model import ( + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class ChecksumMapper(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -27,10 +38,9 @@ def __init__(self, mapper: BasicMapper) -> None: def target(self) -> None: return None - def mapper(self, mapper: BasicMapper = None): + def mapper(self, mapper: Optional[BasicMapper] = None) -> BasicMapper: if mapper: self._mapper = mapper - return self._mapper def upload(self, results: List[Dict[str, Any]], species_id: int) -> None: @@ -46,7 +56,7 @@ def upload(self, results: List[Dict[str, Any]], species_id: int) -> None: self._delete_entries("xref", source_id, xref_dbi) # Start session, in order to get inserted IDs - Session = sessionmaker(self.xref()) + Session = sessionmaker(bind=self.xref().execution_options(isolation_level="READ COMMITTED")) with Session.begin() as session: logging.info("Starting xref insertion") @@ -54,7 +64,7 @@ def upload(self, results: List[Dict[str, Any]], species_id: int) -> None: upi_xref_id = {} for row in results: upi = row["upi"] - if upi_xref_id.get(upi): + if upi in upi_xref_id: row["xref_id"] = upi_xref_id[upi] else: xref_object = XrefUORM( @@ -81,6 +91,8 @@ def upload(self, results: List[Dict[str, Any]], species_id: int) -> None: ) session.add(object_xref_object) + session.commit() + logging.info("Finished insertions") def source_id(self) -> int: diff --git a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py index d938d966c..6bf44f8bc 100644 --- a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py @@ -14,14 +14,25 @@ """Mapper module for processing coordinate xref data.""" -from ensembl.production.xrefs.mappers.BasicMapper import * -from ensembl.common.RangeRegistry import RangeRegistry +import subprocess +import logging +from datetime import datetime +from sqlalchemy import select, func, update, insert + +from ensembl.core.models import ( + ObjectXref as ObjectXrefCORM, + Xref as XrefCORM, + UnmappedObject as UnmappedObjectORM, + UnmappedReason as UnmappedReasonORM, + Analysis as AnalysisORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper coding_weight = 2 ens_weight = 3 transcript_score_threshold = 0.75 - class CoordinateMapper(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: self.xref(mapper.xref()) @@ -33,98 +44,89 @@ def run_coordinatemapping(self, species_name: str, 
species_id: int, scripts_dir: self.update_process_status("coordinate_xrefs_started") # We only do coordinate mapping for mouse and human for now - if species_name != "mus_musculus" and species_name != "homo_sapiens": + if species_name not in ["mus_musculus", "homo_sapiens"]: self.update_process_status("coordinate_xref_finished") return output_dir = self.species_dir() - xref_filename = os.path.join(output_dir, "xref_coord.txt") - object_xref_filename = os.path.join(output_dir, "object_xref_coord.txt") - unmapped_reason_filename = os.path.join(output_dir, "unmapped_reason_coord.txt") - unmapped_object_filename = os.path.join(output_dir, "unmapped_object_coord.txt") - - xref_dbi = self.xref().connect() - core_dbi = self.core().connect() - - # Figure out the last used IDs in the core DB - xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() - object_xref_id = core_dbi.execute( - select(func.max(ObjectXrefCORM.object_xref_id)) - ).scalar() - unmapped_object_id = core_dbi.execute( - select(func.max(UnmappedObjectORM.unmapped_object_id)) - ).scalar() - unmapped_reason_id = core_dbi.execute( - select(func.max(UnmappedReasonORM.unmapped_reason_id)) - ).scalar() - - logging.info( - f"Last used xref_id={xref_id}, object_xref_id={object_xref_id}, unmapped_object_id={unmapped_object_id}, unmapped_reason_id={unmapped_reason_id}" - ) - - # Get an analysis ID - analysis_params = f"weights(coding,ensembl)={coding_weight:.2f},{ens_weight:.2f};transcript_score_threshold={transcript_score_threshold:.2f}" - analysis_id = core_dbi.execute( - select(AnalysisORM.analysis_id).where( - AnalysisORM.logic_name == "xrefcoordinatemapping", - AnalysisORM.parameters == analysis_params, + + with self.xref().connect() as xref_dbi, self.core().connect() as core_dbi: + # Figure out the last used IDs in the core DB + xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() + object_xref_id = core_dbi.execute(select(func.max(ObjectXrefCORM.object_xref_id))).scalar() + unmapped_object_id = core_dbi.execute(select(func.max(UnmappedObjectORM.unmapped_object_id))).scalar() + unmapped_reason_id = core_dbi.execute(select(func.max(UnmappedReasonORM.unmapped_reason_id))).scalar() + + logging.info( + f"Last used xref_id={xref_id}, object_xref_id={object_xref_id}, unmapped_object_id={unmapped_object_id}, unmapped_reason_id={unmapped_reason_id}" ) - ).scalar() - if not analysis_id: + # Get an analysis ID + analysis_params = f"weights(coding,ensembl)={coding_weight:.2f},{ens_weight:.2f};transcript_score_threshold={transcript_score_threshold:.2f}" analysis_id = core_dbi.execute( select(AnalysisORM.analysis_id).where( - AnalysisORM.logic_name == "xrefcoordinatemapping" + AnalysisORM.logic_name == "xrefcoordinatemapping", + AnalysisORM.parameters == analysis_params, ) ).scalar() - if analysis_id: - logging.info("Will update 'analysis' table with new parameter settings") - - # Update an existing analysis - now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - core_dbi.execute( - update(AnalysisORM) - .where(AnalysisORM.analysis_id == analysis_id) - .values(created=now, parameters=analysis_params) - ) - else: - logging.info( - f"Cannot find analysis ID for this analysis: logic_name = 'xrefcoordinatemapping' parameters = {analysis_params}" - ) - - # Store a new analysis - logging.info("A new analysis will be added") - + if not analysis_id: analysis_id = core_dbi.execute( - select(func.max(AnalysisORM.analysis_id)) - ).scalar() - logging.info(f"Last used analysis_id is {analysis_id}") - - analysis_id += 1 - now = 
datetime.now().strftime("%Y-%m-%d %H:%M:%S") - core_dbi.execute( - insert(AnalysisORM).values( - analysis_id=analysis_id, - created=now, - logic_name="xrefcoordinatemapping", - program="CoordinateMapper.pm", - parameters=analysis_params, - module="CoordinateMapper.pm", + select(AnalysisORM.analysis_id).where( + AnalysisORM.logic_name == "xrefcoordinatemapping" ) - ) + ).scalar() - if analysis_id: - logging.info(f"Analysis ID is {analysis_id}") + if analysis_id: + logging.info("Will update 'analysis' table with new parameter settings") - logging.info(f"Running perl script {scripts_dir}/coordinmate_mapper.pl") - perl_cmd = f"perl {scripts_dir}/coordinmate_mapper.pl --xref_db_url '{self.xref()}' --core_db_url '{self.core()}' --species_id {species_id} --output_dir '{output_dir}' --analysis_id {analysis_id}" - cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE) + # Update an existing analysis + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + core_dbi.execute( + update(AnalysisORM) + .where(AnalysisORM.analysis_id == analysis_id) + .values(created=now, parameters=analysis_params) + ) + else: + logging.info( + f"Cannot find analysis ID for this analysis: logic_name = 'xrefcoordinatemapping' parameters = {analysis_params}" + ) - self.update_process_status("coordinate_xref_finished") + # Store a new analysis + logging.info("A new analysis will be added") + + analysis_id = core_dbi.execute(select(func.max(AnalysisORM.analysis_id))).scalar() + logging.info(f"Last used analysis_id is {analysis_id}") + + analysis_id += 1 + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + core_dbi.execute( + insert(AnalysisORM).values( + analysis_id=analysis_id, + created=now, + logic_name="xrefcoordinatemapping", + program="CoordinateMapper.pm", + parameters=analysis_params, + module="CoordinateMapper.pm", + ) + ) + + if analysis_id: + logging.info(f"Analysis ID is {analysis_id}") + + logging.info(f"Running perl script {scripts_dir}/coordinate_mapper.pl") + perl_cmd = [ + "perl", + f"{scripts_dir}/coordinate_mapper.pl", + "--xref_db_url", str(self.xref()), + "--core_db_url", str(self.core()), + "--species_id", str(species_id), + "--output_dir", output_dir, + "--analysis_id", str(analysis_id) + ] + subprocess.run(perl_cmd, capture_output=True, text=True, check=True) - self.biomart_fix("UCSC", "Translation", "Gene", xref_dbi) - self.biomart_fix("UCSC", "Transcript", "Gene", xref_dbi) + self.update_process_status("coordinate_xref_finished") - xref_dbi.close() - core_dbi.close() + self.biomart_fix("UCSC", "Translation", "Gene", xref_dbi) + self.biomart_fix("UCSC", "Transcript", "Gene", xref_dbi) diff --git a/src/python/ensembl/production/xrefs/mappers/CoreInfo.py b/src/python/ensembl/production/xrefs/mappers/CoreInfo.py index eff41f4a2..7ea033aae 100644 --- a/src/python/ensembl/production/xrefs/mappers/CoreInfo.py +++ b/src/python/ensembl/production/xrefs/mappers/CoreInfo.py @@ -14,8 +14,34 @@ """Mapper module for loading core data into an xref database.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +from typing import Any, Dict, List, Optional +from sqlalchemy import select, insert, delete +from sqlalchemy.engine import Connection + +from ensembl.core.models import ( + Gene as GeneORM, + Transcript as TranscriptORM, + Translation as TranslationORM, + Meta as MetaCORM, + AltAllele as AltAlleleCORM, + t_alt_allele_attrib as AltAlleleAttribORM, + ObjectXref as ObjectXrefCORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + SeqRegionAttrib as 
SeqRegionAttribORM, + AttribType as AttribTypeORM +) + +from ensembl.xrefs.xref_update_db_model import ( + GeneTranscriptTranslation as GeneTranscriptTranslationORM, + GeneStableId as GeneStableIdORM, + TranscriptStableId as TranscriptStableIdORM, + TranslationStableId as TranslationStableIdORM, + AltAllele as AltAlleleUORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class CoreInfo(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -33,214 +59,192 @@ def get_core_data(self) -> None: self.update_process_status("core_data_loaded") def load_gene_transcript_translation(self) -> None: - xref_dbi = self.xref().connect() - core_dbi = self.core().connect() - - query = select( - TranscriptORM.gene_id, - TranscriptORM.transcript_id, - TranslationORM.translation_id, - ).outerjoin( - TranslationORM, TranscriptORM.transcript_id == TranslationORM.transcript_id - ) - for row in core_dbi.execute(query).mappings().all(): - xref_dbi.execute( - insert(GeneTranscriptTranslationORM) - .values( - gene_id=row.gene_id, - transcript_id=row.transcript_id, - translation_id=row.translation_id, - ) - .prefix_with("IGNORE") + with self.xref().connect() as xref_dbi, self.core().connect() as core_dbi: + query = select( + TranscriptORM.gene_id, + TranscriptORM.transcript_id, + TranslationORM.translation_id, + ).outerjoin( + TranslationORM, TranscriptORM.transcript_id == TranslationORM.transcript_id ) - - xref_dbi.close() - core_dbi.close() + for row in core_dbi.execute(query).mappings().all(): + xref_dbi.execute( + insert(GeneTranscriptTranslationORM) + .values( + gene_id=row.gene_id, + transcript_id=row.transcript_id, + translation_id=row.translation_id, + ) + .prefix_with("IGNORE") + ) def load_stable_ids(self) -> None: - xref_dbi = self.xref().connect() - core_dbi = self.core().connect() - - core_tables = { - "gene": GeneORM, - "transcript": TranscriptORM, - "translation": TranslationORM, - } - xref_tables = { - "gene": GeneStableIdORM, - "transcript": TranscriptStableIdORM, - "translation": TranslationStableIdORM, - } - - for table in ["gene", "transcript", "translation"]: - column = getattr(core_tables[table], f"{table}_id") - core_query = select( - column.label("internal_id"), core_tables[table].stable_id - ) - if table == "transcript": - core_query = core_query.add_columns(TranscriptORM.biotype) - - count = 0 - for row in core_dbi.execute(core_query).mappings().all(): - xref_query = ( - insert(xref_tables[table]) - .values(internal_id=row.internal_id, stable_id=row.stable_id) - .prefix_with("IGNORE") + with self.xref().connect() as xref_dbi, self.core().connect() as core_dbi: + core_tables = { + "gene": GeneORM, + "transcript": TranscriptORM, + "translation": TranslationORM, + } + xref_tables = { + "gene": GeneStableIdORM, + "transcript": TranscriptStableIdORM, + "translation": TranslationStableIdORM, + } + + for table in ["gene", "transcript", "translation"]: + column = getattr(core_tables[table], f"{table}_id") + core_query = select( + column.label("internal_id"), core_tables[table].stable_id ) if table == "transcript": - xref_query = xref_query.values(biotype=row.biotype) - xref_dbi.execute(xref_query) - - count += 1 + core_query = core_query.add_columns(TranscriptORM.biotype) + + count = 0 + for row in core_dbi.execute(core_query).mappings().all(): + xref_query = ( + insert(xref_tables[table]) + .values(internal_id=row.internal_id, stable_id=row.stable_id) + .prefix_with("IGNORE") + ) + if table == "transcript": + xref_query = xref_query.values(biotype=row.biotype) 
+ xref_dbi.execute(xref_query) - logging.info(f"{count} {table}s loaded from core DB") + count += 1 - xref_dbi.close() - core_dbi.close() + logging.info(f"{count} {table}s loaded from core DB") def get_alt_alleles(self) -> None: - xref_dbi = self.xref().connect() - core_dbi = self.core().connect() - - alt_allele_list = self.fetch_all_alt_alleles(core_dbi) - - count = len(alt_allele_list) - alt_id_to_gene_id, gene_id_to_alt_id, is_reference = {}, {}, {} - max_alt_id = 0 - - if count > 0: - xref_dbi.execute(delete(AltAlleleUORM)) - - alt_added, num_of_genes = 0, 0 - - # Iterate through all alt-allele groups, pushing unique alleles into the xref alt allele table - # Track the reference gene IDs - for group_id, group_members in alt_allele_list.items(): - ref_gene = self.rep_gene_id(group_members) - - # Representative gene not guaranteed, try to find an alternative best fit - if not ref_gene: - logging.info("Get alternative reference gene") - for gene_id in self.get_all_genes(group_members): - query = select(AttribTypeORM.code).where( - SeqRegionAttribORM.seq_region_id == GeneORM.seq_region_id, - AttribTypeORM.attrib_type_id - == SeqRegionAttribORM.attrib_type_id, - GeneORM.gene_id == gene_id, - AttribTypeORM.code == "non_ref", + with self.xref().connect() as xref_dbi, self.core().connect() as core_dbi: + alt_allele_list = self.fetch_all_alt_alleles(core_dbi) + + count = len(alt_allele_list) + max_alt_id = 0 + + if count > 0: + xref_dbi.execute(delete(AltAlleleUORM)) + + alt_added, num_of_genes = 0, 0 + + # Iterate through all alt-allele groups, pushing unique alleles into the xref alt allele table + # Track the reference gene IDs + for group_id, group_members in alt_allele_list.items(): + ref_gene = self.rep_gene_id(group_members) + + # Representative gene not guaranteed, try to find an alternative best fit + if not ref_gene: + logging.info("Get alternative reference gene") + for gene_id in self.get_all_genes(group_members): + query = select(AttribTypeORM.code).where( + SeqRegionAttribORM.seq_region_id == GeneORM.seq_region_id, + AttribTypeORM.attrib_type_id + == SeqRegionAttribORM.attrib_type_id, + GeneORM.gene_id == gene_id, + AttribTypeORM.code == "non_ref", + ) + result = core_dbi.execute(query) + if result.rowcount > 0: + continue + else: + ref_gene = gene_id + break + + if not ref_gene: + logging.warning( + f"Tried very hard but failed to select a representative gene for alt-allele-group {group_id}" ) - result = core_dbi.execute(query) - if result.rowcount > 0: - continue - else: - ref_gene = gene_id - break - - if not ref_gene: - logging.warning( - f"Tried very hard but failed to select a representative gene for alt-allele-group {group_id}" - ) - continue + continue - is_reference[ref_gene] = 1 - others = [] - for member in group_members: - if member[0] != ref_gene: - others.append(member[0]) + others = [member[0] for member in group_members if member[0] != ref_gene] - xref_dbi.execute( - insert(AltAlleleUORM).values( - alt_allele_id=group_id, gene_id=ref_gene, is_reference=1 - ) - ) - num_of_genes += 1 - alt_added += 1 - for gene_id in others: xref_dbi.execute( insert(AltAlleleUORM).values( - alt_allele_id=group_id, gene_id=gene_id, is_reference=0 + alt_allele_id=group_id, gene_id=ref_gene, is_reference=1 ) ) num_of_genes += 1 + alt_added += 1 + for gene_id in others: + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=group_id, gene_id=gene_id, is_reference=0 + ) + ) + num_of_genes += 1 - if group_id > max_alt_id: - max_alt_id = group_id - - logging.info(f"{alt_added} 
alleles found containing {num_of_genes} genes") - else: - logging.info("No alt alleles found for this species") + if group_id > max_alt_id: + max_alt_id = group_id - # LRGs added as alt_alleles in the XREF system but never added to core - count = 0 - old_count, new_count, lrg_count = 0, 0, 0 + logging.info(f"{alt_added} alleles found containing {num_of_genes} genes") + else: + logging.info("No alt alleles found for this species") - query = ( - select(ObjectXrefCORM.ensembl_id, GeneORM.gene_id) - .where( - XrefCORM.xref_id == ObjectXrefCORM.xref_id, - ExternalDbORM.external_db_id == XrefCORM.external_db_id, - ObjectXrefCORM.ensembl_object_type == "Gene", - XrefCORM.display_label == GeneORM.stable_id, - ) - .filter(ExternalDbORM.db_name.like("Ens_Hs_gene")) - ) - for row in core_dbi.execute(query).mappings().all(): - # If the core gene is already in an alt_allele set then use that alt_id for the LRG gene only - # Else use a new one and add both core and LRG - group_id = self.fetch_group_id_by_gene_id(row.gene_id, core_dbi) - if group_id: - xref_dbi.execute( - insert(AltAlleleUORM).values( - alt_allele_id=group_id, gene_id=row.ensembl_id, is_reference=0 - ) + # LRGs added as alt_alleles in the XREF system but never added to core + count = 0 + old_count, new_count, lrg_count = 0, 0, 0 + + query = ( + select(ObjectXrefCORM.ensembl_id, GeneORM.gene_id) + .where( + XrefCORM.xref_id == ObjectXrefCORM.xref_id, + ExternalDbORM.external_db_id == XrefCORM.external_db_id, + ObjectXrefCORM.ensembl_object_type == "Gene", + XrefCORM.display_label == GeneORM.stable_id, ) - old_count += 1 - else: - group_id = self.fetch_group_id_by_gene_id(row.ensembl_id, core_dbi) + .filter(ExternalDbORM.db_name.like("Ens_Hs_gene")) + ) + for row in core_dbi.execute(query).mappings().all(): + # If the core gene is already in an alt_allele set then use that alt_id for the LRG gene only + # Else use a new one and add both core and LRG + group_id = self.fetch_group_id_by_gene_id(row.gene_id, core_dbi) if group_id: xref_dbi.execute( insert(AltAlleleUORM).values( - alt_allele_id=group_id, - gene_id=row.ensembl_id, - is_reference=1, + alt_allele_id=group_id, gene_id=row.ensembl_id, is_reference=0 ) ) - lrg_count += 1 - logging.info(f"LRG peculiarity\t{row.gene_id}\t{row.ensembl_id}") + old_count += 1 else: - max_alt_id += 1 - xref_dbi.execute( - insert(AltAlleleUORM).values( - alt_allele_id=max_alt_id, - gene_id=row.ensembl_id, - is_reference=0, + group_id = self.fetch_group_id_by_gene_id(row.ensembl_id, core_dbi) + if group_id: + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=group_id, + gene_id=row.ensembl_id, + is_reference=1, + ) ) - ) - xref_dbi.execute( - insert(AltAlleleUORM).values( - alt_allele_id=max_alt_id, - gene_id=row.gene_id, - is_reference=1, + lrg_count += 1 + logging.info(f"LRG peculiarity\t{row.gene_id}\t{row.ensembl_id}") + else: + max_alt_id += 1 + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=max_alt_id, + gene_id=row.ensembl_id, + is_reference=0, + ) ) - ) - new_count += 1 - count += 1 - - if count: - logging.info( - f"Added {count} alt_alleles for the LRGs. {old_count} added to previous alt_alleles and {new_count} new ones" - ) - logging.info(f"LRG problem count = {lrg_count}") + xref_dbi.execute( + insert(AltAlleleUORM).values( + alt_allele_id=max_alt_id, + gene_id=row.gene_id, + is_reference=1, + ) + ) + new_count += 1 + count += 1 - xref_dbi.close() - core_dbi.close() + if count: + logging.info( + f"Added {count} alt_alleles for the LRGs. 
{old_count} added to previous alt_alleles and {new_count} new ones" + ) + logging.info(f"LRG problem count = {lrg_count}") - self.update_process_status("alt_alleles_added") + self.update_process_status("alt_alleles_added") def fetch_all_alt_alleles(self, dbi: Connection) -> Dict[int, List[List[Any]]]: group_list = {} - query = None - if self.is_multispecies(dbi): ##### TO DO: handle multiespecies raise NotImplementedError(f"Pipeline cannot handle multispecies DBs yet") @@ -281,7 +285,7 @@ def fetch_group_id_by_gene_id(self, gene_id: int, dbi: Connection) -> Optional[i ) group_list = dbi.execute(query).mappings().all() - if len(group_list) > 0: + if group_list: return group_list[0].alt_allele_group_id return None @@ -293,10 +297,7 @@ def is_multispecies(self, dbi: Connection) -> bool: ) ) - if result.rowcount > 1: - return True - else: - return False + return result.rowcount > 1 def rep_gene_id(self, group: List[List[Any]]) -> Optional[int]: for allele in group: @@ -312,9 +313,4 @@ def rep_gene_id(self, group: List[List[Any]]) -> Optional[int]: return None def get_all_genes(self, group: List[List[Any]]) -> List[int]: - gene_ids = [] - - for allele in group: - gene_ids.append(allele[0]) - - return sorted(gene_ids) + return sorted(allele[0] for allele in group) diff --git a/src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py b/src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py index c3113dee3..a05028aca 100644 --- a/src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/DirectXrefsMapper.py @@ -14,8 +14,26 @@ """Mapper module for processing direct xref data.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +import re +from typing import Any, Dict +from sqlalchemy import select, insert +from sqlalchemy.engine import Connection + +from ensembl.xrefs.xref_update_db_model import ( + GeneStableId as GeneStableIdORM, + TranscriptStableId as TranscriptStableIdORM, + TranslationStableId as TranslationStableIdORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + GeneDirectXref as GeneDirectXrefORM, + TranscriptDirectXref as TranscriptDirectXrefORM, + TranslationDirectXref as TranslationDirectXrefORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class DirectXrefsMapper(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -41,9 +59,9 @@ def process(self) -> None: err_count = {} object_xref_id = 0 - for table in ["gene", "transcript", "translation"]: - direct_table = db_tables[table]["direct"] - stable_id_table = db_tables[table]["stable_id"] + for object_type, tables in db_tables.items(): + direct_table = tables["direct"] + stable_id_table = tables["stable_id"] count, duplicate_direct_count, duplicate_dependent_count = 0, 0, 0 @@ -89,14 +107,14 @@ def process(self) -> None: # Insert into object xref table object_xref_id = self.get_object_xref_id( - internal_id, xref_id, table, "DIRECT", xref_dbi + internal_id, xref_id, object_type, "DIRECT", xref_dbi ) if object_xref_id: duplicate_direct_count += 1 continue else: object_xref_id = self.add_object_xref( - internal_id, xref_id, table, "DIRECT", xref_dbi + internal_id, xref_id, object_type, "DIRECT", xref_dbi ) # Insert into identity xref table @@ -113,7 +131,7 @@ def process(self) -> None: { "master_xrefs": master_xref_ids, "dup_count": duplicate_dependent_count, - "table": table, + "table": object_type, "internal_id": 
internal_id, }, xref_dbi, @@ -121,7 +139,7 @@ def process(self) -> None: if duplicate_direct_count or duplicate_dependent_count: logging.info( - f"Duplicate entries ignored for {duplicate_direct_count} direct xrefs and {duplicate_dependent_count} dependent xrefs" + f"Duplicate entries ignored for {duplicate_direct_count} direct xrefs and {duplicate_dependent_count} dependent xrefs" ) for key, val in err_count.items(): diff --git a/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py index a2a543589..22b6f61b7 100644 --- a/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py +++ b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py @@ -14,8 +14,39 @@ """Mapper module for setting display xrefs in the core DB.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +import re +from datetime import datetime +from typing import Dict, List, Tuple +from sqlalchemy import select, insert, update, delete, case, desc, func, aliased +from sqlalchemy.engine import Connection +from sqlalchemy.sql import Select + +from ensembl.core.models import ( + Gene as GeneORM, + Transcript as TranscriptORM, + Translation as TranslationORM, + Meta as MetaCORM, + ObjectXref as ObjectXrefCORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + ExternalSynonym as ExternalSynonymORM +) + +from ensembl.xrefs.xref_update_db_model import ( + GeneTranscriptTranslation as GeneTranscriptTranslationORM, + GeneStableId as GeneStableIdORM, + TranscriptStableId as TranscriptStableIdORM, + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + DisplayXrefPriority as DisplayXrefPriorityORM, + GeneDescPriority as GeneDescPriorityORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class DisplayXrefs(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -36,16 +67,14 @@ def build_display_xrefs(self) -> None: mapper = self.mapper() # Set the display xrefs + set_transcript_display_xrefs = hasattr(mapper, "set_transcript_names") if hasattr(mapper, "set_display_xrefs"): mapper.set_display_xrefs() else: - set_transcript_display_xrefs = False - if hasattr(mapper, "set_transcript_names"): - set_transcript_display_xrefs = True self.set_display_xrefs(set_transcript_display_xrefs) # Set transcript names - if hasattr(mapper, "set_transcript_names"): + if set_transcript_display_xrefs: mapper.set_transcript_names() else: self.set_transcript_names() @@ -64,47 +93,33 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: logging.info("Setting Transcript and Gene display xrefs") # Get the xref offset used when adding the xrefs into the core DB - xref_offset = self.get_meta_value("xref_offset") - xref_offset = int(xref_offset) + xref_offset = int(self.get_meta_value("xref_offset")) logging.info(f"Using xref offset of {xref_offset}") xref_dbi = self.xref().connect() core_dbi = self.core().connect() mapper = self.mapper() - # Reset transcript display xrefs + # Reset transcript display xrefs if required if set_transcript_display_xrefs: core_dbi.execute( update(TranscriptORM) .values(display_xref_id=None) - .where(TranslationORM.biotype != "LRG_gene") + .where(TranscriptORM.biotype != "LRG_gene") ) for object_type in ["Gene", "Transcript"]: if object_type == "Transcript" and not set_transcript_display_xrefs: continue - precedence_list, ignore = None, None # Get name source priorities and ignore 
queries method = f"{object_type.lower()}_display_xref_sources" - if hasattr(mapper, method): - precedence_list, ignore = getattr(mapper, method)() - else: - precedence_list, ignore = getattr(self, method)() + precedence_list, ignore = getattr(mapper, method)() if hasattr(mapper, method) else getattr(self, method)() # Add the priorities into the DB - priority = 0 logging.info(f"Precedence for {object_type} display xrefs (1- best name)") - - for source_name in precedence_list: - priority += 1 - - # Get the source ID - query = ( - select(SourceUORM.source_id, SourceUORM.name) - .where(SourceUORM.name.like(source_name)) - .order_by(SourceUORM.priority) - ) + for priority, source_name in enumerate(precedence_list, start=1): + query = select(SourceUORM.source_id, SourceUORM.name).where(SourceUORM.name.like(source_name)) for row in xref_dbi.execute(query).mappings().all(): xref_dbi.execute( insert(DisplayXrefPriorityORM).values( @@ -113,8 +128,7 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: priority=priority, ) ) - - logging.info(f"{priority} - {row.name}") + logging.info(f"{priority} - {row.name}") # Execute ignore queries self._apply_ignore(ignore, xref_dbi) @@ -129,34 +143,19 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: gene_case_stmt = case( [ (ObjectXrefUORM.ensembl_object_type == "Gene", GTTGene.gene_id), - ( - ObjectXrefUORM.ensembl_object_type == "Transcript", - GTTTranscript.gene_id, - ), - ( - ObjectXrefUORM.ensembl_object_type == "Translation", - GTTTranslation.gene_id, - ), + (ObjectXrefUORM.ensembl_object_type == "Transcript", GTTTranscript.gene_id), + (ObjectXrefUORM.ensembl_object_type == "Translation", GTTTranslation.gene_id), ], ).label("d_gene_id") transcript_case_stmt = case( [ - ( - ObjectXrefUORM.ensembl_object_type == "Gene", - GTTGene.transcript_id, - ), - ( - ObjectXrefUORM.ensembl_object_type == "Transcript", - GTTTranscript.transcript_id, - ), - ( - ObjectXrefUORM.ensembl_object_type == "Translation", - GTTTranslation.transcript_id, - ), + (ObjectXrefUORM.ensembl_object_type == "Gene", GTTGene.transcript_id), + (ObjectXrefUORM.ensembl_object_type == "Transcript", GTTTranscript.transcript_id), + (ObjectXrefUORM.ensembl_object_type == "Translation", GTTTranslation.transcript_id), ], ).label("d_transcript_id") - # Get all relevent xrefs for this object type based on precendence sources + # Get all relevant xrefs for this object type based on precedence sources query = ( select( gene_case_stmt, @@ -164,24 +163,13 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: DisplayXrefPriorityORM.priority, XrefUORM.xref_id, ) - .join( - SourceUORM, SourceUORM.source_id == DisplayXrefPriorityORM.source_id - ) + .join(SourceUORM, SourceUORM.source_id == DisplayXrefPriorityORM.source_id) .join(XrefUORM, XrefUORM.source_id == SourceUORM.source_id) .join(ObjectXrefUORM, ObjectXrefUORM.xref_id == XrefUORM.xref_id) - .join( - IdentityXrefUORM, - IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, - ) + .join(IdentityXrefUORM, IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id) .outerjoin(GTTGene, GTTGene.gene_id == ObjectXrefUORM.ensembl_id) - .outerjoin( - GTTTranscript, - GTTTranscript.transcript_id == ObjectXrefUORM.ensembl_id, - ) - .outerjoin( - GTTTranslation, - GTTTranslation.translation_id == ObjectXrefUORM.ensembl_id, - ) + .outerjoin(GTTTranscript, GTTTranscript.transcript_id == ObjectXrefUORM.ensembl_id) + .outerjoin(GTTTranslation, GTTTranslation.translation_id == 
ObjectXrefUORM.ensembl_id) .where( ObjectXrefUORM.ox_status == "DUMP_OUT", DisplayXrefPriorityORM.ensembl_object_type == object_type, @@ -190,32 +178,22 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: "d_gene_id", ObjectXrefUORM.ensembl_object_type, DisplayXrefPriorityORM.priority, - desc( - IdentityXrefUORM.target_identity - + IdentityXrefUORM.query_identity - ), + desc(IdentityXrefUORM.target_identity + IdentityXrefUORM.query_identity), ObjectXrefUORM.unused_priority.desc(), XrefUORM.accession, ) ) for row in xref_dbi.execute(query).mappings().all(): - object_id = None - if object_type == "Gene": - object_id = row.d_gene_id - elif object_type == "Transcript": - object_id = row.d_transcript_id + object_id = row.d_gene_id if object_type == "Gene" else row.d_transcript_id # Update the display xrefs - if not object_seen.get(object_id): + if object_id not in object_seen: xref_id = int(row.xref_id) if object_type == "Gene": core_dbi.execute( update(GeneORM) .values(display_xref_id=xref_id + xref_offset) - .where( - GeneORM.gene_id == object_id, - GeneORM.display_xref_id == None, - ) + .where(GeneORM.gene_id == object_id, GeneORM.display_xref_id == None) ) elif object_type == "Transcript": core_dbi.execute( @@ -225,7 +203,7 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: ) display_xref_count += 1 - object_seen[object_id] = 1 + object_seen[object_id] = True logging.info(f"Updated {display_xref_count} {object_type} display_xrefs") @@ -242,8 +220,7 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: .outerjoin(GeneORM, GeneORM.display_xref_id == XrefCORM.xref_id) .where(GeneORM.display_xref_id == None) ) - result = core_dbi.execute(query).fetchall() - xref_ids = [row[0] for row in result] + xref_ids = [row[0] for row in core_dbi.execute(query).fetchall()] core_dbi.execute( delete(ExternalSynonymORM).where(ExternalSynonymORM.xref_id.in_(xref_ids)) @@ -286,6 +263,7 @@ def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: ) ignore_queries["EntrezGene"] = query + # Ignore LOC-prefixed labels query = ( select(ObjectXrefUORM.object_xref_id) .join(XrefUORM, XrefUORM.xref_id == ObjectXrefUORM.xref_id) @@ -304,22 +282,23 @@ def transcript_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]] def _apply_ignore(self, ignore_queries: Dict[str, Select], dbi: Connection) -> None: # Set status to NO_DISPLAY for object_xrefs with a display_label that is just numeric - query = ( + numeric_label_query = ( update(ObjectXrefUORM) .values(ox_status="NO_DISPLAY") .where( ObjectXrefUORM.xref_id == XrefUORM.xref_id, XrefUORM.source_id == SourceUORM.source_id, - ObjectXrefUORM.ox_status.like("DUMP_OUT"), + ObjectXrefUORM.ox_status == "DUMP_OUT", XrefUORM.label.regexp_match("^[0-9]+$"), ) ) - dbi.execute(query) + dbi.execute(numeric_label_query) # Go through ignore queries for ignore_type, ignore_query in ignore_queries.items(): # Set status to NO_DISPLAY for ignore results - for row in dbi.execute(ignore_query).mappings().all(): + ignore_results = dbi.execute(ignore_query).mappings().all() + for row in ignore_results: dbi.execute( update(ObjectXrefUORM) .values(ox_status="NO_DISPLAY") @@ -339,12 +318,8 @@ def set_transcript_names(self) -> None: ) # Get the max xref and object_xref IDs - xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() - xref_id = int(xref_id) - object_xref_id = core_dbi.execute( - select(func.max(ObjectXrefCORM.object_xref_id)) - ).scalar() - object_xref_id = 
int(object_xref_id) + xref_id = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() or 0 + object_xref_id = core_dbi.execute(select(func.max(ObjectXrefCORM.object_xref_id))).scalar() or 0 # Get all genes with set display_xref_id query = select( @@ -373,12 +348,12 @@ def set_transcript_names(self) -> None: ) # Get transcripts related to current gene - query = ( + transcript_query = ( select(TranscriptORM.transcript_id) .where(TranscriptORM.gene_id == row.gene_id) .order_by(TranscriptORM.seq_region_start, TranscriptORM.seq_region_end) ) - for transcript_row in core_dbi.execute(query).mappings().all(): + for transcript_row in core_dbi.execute(transcript_query).mappings().all(): object_xref_id += 1 display_label = f"{row.display_label}-{ext}" @@ -424,7 +399,7 @@ def set_transcript_names(self) -> None: ) ) - # Set transcript dispay xref + # Set transcript display xref core_dbi.execute( update(TranscriptORM) .values(display_xref_id=insert_xref_id) @@ -434,13 +409,12 @@ def set_transcript_names(self) -> None: ext += 1 # Delete object xrefs with no matching xref - query = ( + delete_query = ( select(ObjectXrefCORM.object_xref_id) .outerjoin(XrefCORM, XrefCORM.xref_id == ObjectXrefCORM.xref_id) .where(XrefCORM.xref_id == None) ) - result = core_dbi.execute(query).fetchall() - object_xref_ids = [row[0] for row in result] + object_xref_ids = [row[0] for row in core_dbi.execute(delete_query).fetchall()] core_dbi.execute( delete(ObjectXrefCORM).where( @@ -460,61 +434,41 @@ def set_gene_descriptions(self) -> None: # Reset the gene descriptions core_dbi.execute(update(GeneORM).values(description=None)) - # Get external display names - name_to_external_name = {} - query = select( - ExternalDbORM.external_db_id, - ExternalDbORM.db_name, - ExternalDbORM.db_display_name, - ) - for row in core_dbi.execute(query).mappings().all(): - name_to_external_name[row.db_name] = row.db_display_name - # Get source ID to external names mappings - if hasattr(mapper, "set_source_id_to_external_name"): - source_id_to_external_name, name_to_source_id = ( - mapper.set_source_id_to_external_name(name_to_external_name, xref_dbi) - ) - else: - source_id_to_external_name, name_to_source_id = ( - self.set_source_id_to_external_name(name_to_external_name, xref_dbi) - ) + source_id_to_external_name, name_to_source_id = self.get_external_name_mappings(core_dbi, xref_dbi) # Get description source priorities and ignore queries - if hasattr(mapper, "gene_description_sources"): - precedence_list = mapper.gene_description_sources() - ignore = None - else: - precedence_list, ignore = self.gene_description_sources() + precedence_list, ignore = ( + mapper.gene_description_sources() + if hasattr(mapper, "gene_description_sources") + else self.gene_description_sources() + ) # Get description regular expressions - if hasattr(mapper, "gene_description_filter_regexps"): - reg_exps = mapper.gene_description_filter_regexps() - else: - reg_exps = self.gene_description_filter_regexps() + reg_exps = ( + mapper.gene_description_filter_regexps() + if hasattr(mapper, "gene_description_filter_regexps") + else self.gene_description_filter_regexps() + ) # Add the description priorities into the DB - priority = 0 logging.info("Precedence for Gene descriptions (1- best description)") - - for source_name in precedence_list: - priority += 1 - - # Get the source ID - query = select(SourceUORM.source_id, SourceUORM.name).where( - SourceUORM.name.like(source_name) - ) - for row in xref_dbi.execute(query).mappings().all(): + for priority, source_name in 
enumerate(precedence_list, start=1): + for row in xref_dbi.execute( + select(SourceUORM.source_id, SourceUORM.name).where( + SourceUORM.name.like(source_name) + ) + ).mappings().all(): xref_dbi.execute( insert(GeneDescPriorityORM) .values(source_id=row.source_id, priority=priority) .prefix_with("IGNORE") ) - - logging.info(f"{priority} - {row.name}") + logging.info(f"{priority} - {row.name}") # Execute ignore queries - self._apply_ignore(ignore, xref_dbi) + if ignore: + self._apply_ignore(ignore, xref_dbi) no_source_name_in_desc = {} if hasattr(mapper, "no_source_label_list"): @@ -522,9 +476,9 @@ def set_gene_descriptions(self) -> None: source_id = name_to_source_id.get(source_name) if source_id: logging.info( - f"Source '{name}' will not have [Source:...] info in description" + f"Source '{source_name}' will not have [Source:...] info in description" ) - no_source_name_in_desc[source_id] = 1 + no_source_name_in_desc[source_id] = True gene_desc_updated = {} @@ -535,18 +489,12 @@ def set_gene_descriptions(self) -> None: gene_case_stmt = case( [ (ObjectXrefUORM.ensembl_object_type == "Gene", GTTGene.gene_id), - ( - ObjectXrefUORM.ensembl_object_type == "Transcript", - GTTTranscript.gene_id, - ), - ( - ObjectXrefUORM.ensembl_object_type == "Translation", - GTTTranslation.gene_id, - ), + (ObjectXrefUORM.ensembl_object_type == "Transcript", GTTTranscript.gene_id), + (ObjectXrefUORM.ensembl_object_type == "Translation", GTTTranslation.gene_id), ], ).label("d_gene_id") - # Get all relevent xrefs for this object type based on precendence sources + # Get all relevant xrefs for this object type based on precedence sources query = ( select( gene_case_stmt, @@ -558,53 +506,40 @@ def set_gene_descriptions(self) -> None: .join(SourceUORM, SourceUORM.source_id == GeneDescPriorityORM.source_id) .join(XrefUORM, XrefUORM.source_id == SourceUORM.source_id) .join(ObjectXrefUORM, ObjectXrefUORM.xref_id == XrefUORM.xref_id) - .join( - IdentityXrefUORM, - IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, - ) + .join(IdentityXrefUORM, IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id) .outerjoin(GTTGene, GTTGene.gene_id == ObjectXrefUORM.ensembl_id) - .outerjoin( - GTTTranscript, GTTTranscript.transcript_id == ObjectXrefUORM.ensembl_id - ) - .outerjoin( - GTTTranslation, - GTTTranslation.translation_id == ObjectXrefUORM.ensembl_id, - ) + .outerjoin(GTTTranscript, GTTTranscript.transcript_id == ObjectXrefUORM.ensembl_id) + .outerjoin(GTTTranslation, GTTTranslation.translation_id == ObjectXrefUORM.ensembl_id) .where(ObjectXrefUORM.ox_status == "DUMP_OUT") .order_by( "d_gene_id", ObjectXrefUORM.ensembl_object_type, GeneDescPriorityORM.priority, - desc( - IdentityXrefUORM.target_identity + IdentityXrefUORM.query_identity - ), + desc(IdentityXrefUORM.target_identity + IdentityXrefUORM.query_identity), ) ) for row in xref_dbi.execute(query).mappings().all(): - if gene_desc_updated.get(row.d_gene_id): + if row.d_gene_id in gene_desc_updated: continue if row.description: # Apply regular expressions to description filtered_description = self.filter_by_regexp(row.description, reg_exps) - if filtered_description != "": - source_name = source_id_to_external_name.get(row.source_id) - filtered_description += ( - f" [Source:{source_name};Acc:{row.accession}]" - ) + if filtered_description: + if row.source_id not in no_source_name_in_desc: + source_name = source_id_to_external_name.get(row.source_id) + filtered_description += f" [Source:{source_name};Acc:{row.accession}]" - # Update the gene 
description - core_dbi.execute( - update(GeneORM) - .values(description=filtered_description) - .where( - GeneORM.gene_id == row.d_gene_id, GeneORM.description == None + # Update the gene description + core_dbi.execute( + update(GeneORM) + .values(description=filtered_description) + .where(GeneORM.gene_id == row.d_gene_id, GeneORM.description == None) ) - ) - gene_desc_updated[row.d_gene_id] = 1 + gene_desc_updated[row.d_gene_id] = True - logging.info(f"{len(gene_desc_updated.keys())} gene descriptions added") + logging.info(f"{len(gene_desc_updated)} gene descriptions added") # Reset ignored object xrefs xref_dbi.execute( @@ -618,14 +553,16 @@ def set_gene_descriptions(self) -> None: def get_external_name_mappings(self, core_dbi: Connection, xref_dbi: Connection) -> Tuple[Dict[int, str], Dict[str, int]]: # Get external display names - external_name_to_display_name = {} - query = select( - ExternalDbORM.external_db_id, - ExternalDbORM.db_name, - ExternalDbORM.db_display_name, - ) - for row in core_dbi.execute(query).mappings().all(): - external_name_to_display_name[row.db_name] = row.db_display_name + external_name_to_display_name = { + row.db_name: row.db_display_name + for row in core_dbi.execute( + select( + ExternalDbORM.external_db_id, + ExternalDbORM.db_name, + ExternalDbORM.db_display_name, + ) + ).mappings().all() + } # Get sources for available xrefs source_id_to_external_name, source_name_to_source_id = {}, {} @@ -645,26 +582,6 @@ def get_external_name_mappings(self, core_dbi: Connection, xref_dbi: Connection) return source_id_to_external_name, source_name_to_source_id - def set_source_id_to_external_name(self, name_to_external_name: Dict[str, str], dbi: Connection) -> Tuple[Dict[int, str], Dict[str, int]]: - source_id_to_external_name, name_to_source_id = {}, {} - - # Get sources for available xrefs - query = ( - select(SourceUORM.source_id, SourceUORM.name) - .where(SourceUORM.source_id == XrefUORM.source_id) - .group_by(SourceUORM.source_id) - ) - for row in dbi.execute(query).mappings().all(): - if name_to_external_name.get(row.name): - source_id_to_external_name[row.source_id] = name_to_external_name[row.name] - name_to_source_id[row.name] = row.source_id - elif re.search(r"notransfer$", row.name): - logging.info(f"Ignoring notransfer source '{row.name}'") - else: - raise LookupError(f"Could not find {row.name} in external_db table") - - return source_id_to_external_name, name_to_source_id - def gene_description_sources(self) -> Tuple[List[str], Dict[str, Select]]: return self.gene_display_xref_sources() @@ -746,20 +663,25 @@ def filter_by_regexp(self, string: str, regular_expressions: List[str]) -> str: return string def set_meta_timestamp(self) -> None: + logging.info("Setting meta timestamp for xrefs") + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + with self.core().connect() as dbi: + # Delete existing xref timestamp dbi.execute(delete(MetaCORM).where(MetaCORM.meta_key == "xref.timestamp")) - now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + # Insert new xref timestamp dbi.execute( insert(MetaCORM).values(meta_key="xref.timestamp", meta_value=now) ) + logging.info(f"Meta timestamp set to {now}") + def set_display_xrefs_from_stable_table(self) -> None: logging.info("Setting Transcript and Gene display xrefs using stable IDs") # Get the xref offset used when adding the xrefs into the core DB - xref_offset = self.get_meta_value("xref_offset") - xref_offset = int(xref_offset) + xref_offset = int(self.get_meta_value("xref_offset")) logging.info(f"Using xref offset 
of {xref_offset}") xref_dbi = self.xref().connect() @@ -776,26 +698,8 @@ def set_display_xrefs_from_stable_table(self) -> None: .where(GeneORM.description.like("%[Source:%]%")) ) - # Get external names and IDs - name_to_external_name, source_id_to_external_name = {}, {} - query = select( - ExternalDbORM.external_db_id, - ExternalDbORM.db_name, - ExternalDbORM.db_display_name, - ) - for row in core_dbi.execute(query).mappings().all(): - name_to_external_name[row.db_name] = row.db_display_name - - query = ( - select(SourceUORM.source_id, SourceUORM.name) - .where(SourceUORM.source_id == XrefUORM.source_id) - .group_by(SourceUORM.source_id) - ) - for row in xref_dbi.execute(query).mappings().all(): - if name_to_external_name.get(row.name): - source_id_to_external_name[row.source_id] = name_to_external_name[ - row.name - ] + # Get source ID to external names mappings + source_id_to_external_name, name_to_source_id = self.get_external_name_mappings(core_dbi, xref_dbi) gene_count = 0 @@ -818,7 +722,7 @@ def set_display_xrefs_from_stable_table(self) -> None: ) # Set description - if row.description is not None and row.description != "": + if row.description: description = f"{row.description} [Source:{source_id_to_external_name[row.source_id]};Acc:{row.accession}]" core_dbi.execute( update(GeneORM) diff --git a/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py b/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py index e4c33bf75..82768724e 100644 --- a/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py +++ b/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py @@ -14,8 +14,25 @@ """Mapper module for setting the feature names.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +import re +from typing import Any, Dict, Tuple, List +from sqlalchemy import select, func, update, case, desc, insert, aliased, delete +from sqlalchemy.engine import Connection + +from ensembl.xrefs.xref_update_db_model import ( + GeneTranscriptTranslation as GeneTranscriptTranslationORM, + GeneStableId as GeneStableIdORM, + TranscriptStableId as TranscriptStableIdORM, + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + Synonym as SynonymORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class OfficialNaming(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -33,7 +50,7 @@ def official_name(self, official_name: str = None) -> str: def run(self, species_id: int, verbose: bool) -> None: logging.info("Starting official naming") - # If no offical name then we do not want to go any further + # If no official name then we do not want to go any further dbname = self.official_name() if not dbname: self.update_process_status("official_naming_done") @@ -42,30 +59,24 @@ def run(self, species_id: int, verbose: bool) -> None: xref_dbi = self.xref().connect() # If there are any official names on transcripts or translations, move them onto gene level - if dbname == "MGI": - self.biomart_fix("MGI", "Translation", "Gene", xref_dbi) - self.biomart_fix("MGI", "Transcript", "Gene", xref_dbi) - if dbname == "ZFIN_ID": - self.biomart_fix("ZFIN_ID", "Translation", "Gene", xref_dbi) - self.biomart_fix("ZFIN_ID", "Transcript", "Gene", xref_dbi) - if dbname == "RGD": - self.biomart_fix("RGD", "Translation", "Gene", xref_dbi) - self.biomart_fix("RGD", "Transcript", "Gene", xref_dbi) + for name in ["MGI", "ZFIN_ID", "RGD"]: + if dbname == 
name: + self.biomart_fix(name, "Translation", "Gene", xref_dbi) + self.biomart_fix(name, "Transcript", "Gene", xref_dbi) # Get the current max values for xref and object_xref - max_xref_id = xref_dbi.execute(select(func.max(XrefUORM.xref_id))).scalar() - max_xref_id = int(max_xref_id) - max_object_xref_id = xref_dbi.execute( - select(func.max(ObjectXrefUORM.object_xref_id)) - ).scalar() - max_object_xref_id = int(max_object_xref_id) + max_xref_id = int(xref_dbi.execute(select(func.max(XrefUORM.xref_id))).scalar()) + max_object_xref_id = int(xref_dbi.execute(select(func.max(ObjectXrefUORM.object_xref_id))).scalar()) # Get labels, descriptions, and synonyms display_label_to_desc = self.get_display_label_data(dbname, xref_dbi) - synonyms = self.get_synonyms(dbname, xref_dbi) # Get source IDs - dbname_to_source_id = self.get_dbname_to_source_id(dbname, xref_id) + dbname_to_source_id = self.get_dbname_to_source_id(dbname, xref_dbi) + + # Delete old data (from previous run) + logging.info(f"Deleting old data for sources: {', '.join(dbname_to_source_id.keys())}") + self.delete_old_data(dbname_to_source_id.values(), xref_dbi) # Reset gene and transcript stable id display data self.reset_display_xrefs(xref_dbi) @@ -83,13 +94,12 @@ def run(self, species_id: int, verbose: bool) -> None: ) .where( GeneTranscriptTranslationORM.gene_id == GeneStableIdORM.internal_id, - GeneTranscriptTranslationORM.transcript_id - == TranscriptStableIdORM.internal_id, + GeneTranscriptTranslationORM.transcript_id == TranscriptStableIdORM.internal_id, ) .order_by(GeneStableIdORM.stable_id, TranscriptStableIdORM.stable_id) ) for row in xref_dbi.execute(query).mappings().all(): - if not gene_to_transcripts.get(row.gene_id): + if row.gene_id not in gene_to_transcripts: sorted_gene_ids.append(row.gene_id) gene_to_transcripts.setdefault(row.gene_id, []).append(row.transcript_id) @@ -116,7 +126,7 @@ def run(self, species_id: int, verbose: bool) -> None: ObjectXrefUORM.ox_status == "DUMP_OUT", ) for row in xref_dbi.execute(query).mappings().all(): - ignore_object[row.object_xref_id] = 1 + ignore_object[row.object_xref_id] = True xref_added, seen_gene, official_name_used = {}, {}, {} @@ -125,7 +135,7 @@ def run(self, species_id: int, verbose: bool) -> None: transcript_source = dbname gene_symbol, gene_symbol_xref_id, is_lrg = None, None, 0 - # Get offical name if it has one + # Get official name if it has one gene_symbol, gene_symbol_xref_id = self.get_official_domain_name( { "gene_id": gene_id, @@ -138,22 +148,15 @@ def run(self, species_id: int, verbose: bool) -> None: ) if gene_symbol_xref_id: - official_name_used[gene_symbol_xref_id] = 1 + official_name_used[gene_symbol_xref_id] = True # If not found see if there is an LRG entry if not gene_symbol: - gene_symbol, gene_symbol_xref_id, is_lrg = self.find_lrg_hgnc( - gene_id, xref_dbi - ) + gene_symbol, gene_symbol_xref_id, is_lrg = self.find_lrg_hgnc(gene_id, xref_dbi) # If not found look for other valid database sources (RFAM and miRBase, EntrezGene) if not gene_symbol: - ( - gene_symbol, - gene_symbol_xref_id, - transcript_source, - display_label_to_desc, - ) = self.find_from_other_sources( + gene_symbol, gene_symbol_xref_id, transcript_source, display_label_to_desc = self.find_from_other_sources( ignore_object, { "gene_id": gene_id, @@ -173,26 +176,22 @@ def run(self, species_id: int, verbose: bool) -> None: if not is_lrg: # Set transcript names - max_xref_id, max_object_xref_id, xref_added, seen_gene = ( - self.set_transcript_display_xrefs( - { - "max_xref_id": max_xref_id, 
- "max_object_xref_id": max_object_xref_id, - "gene_id": gene_id, - "gene_id_to_stable_id": gene_id_to_stable_id, - "gene_symbol": gene_symbol, - "description": description, - "source_id": dbname_to_source_id.get( - f"{transcript_source}_trans_name" - ), - "xref_added": xref_added, - "seen_gene": seen_gene, - "transcript_ids": gene_to_transcripts.get(gene_id, []), - "transcript_source": transcript_source, - "species_id": species_id, - }, - xref_dbi, - ) + max_xref_id, max_object_xref_id = self.set_transcript_display_xrefs( + { + "max_xref_id": max_xref_id, + "max_object_xref_id": max_object_xref_id, + "gene_id": gene_id, + "gene_id_to_stable_id": gene_id_to_stable_id, + "gene_symbol": gene_symbol, + "description": description, + "source_id": dbname_to_source_id.get(f"{transcript_source}_trans_name"), + "transcript_ids": gene_to_transcripts.get(gene_id, []), + "transcript_source": transcript_source, + "species_id": species_id, + }, + xref_added, + seen_gene, + xref_dbi, ) xref_dbi.close() @@ -217,49 +216,35 @@ def get_display_label_data(self, dbname: str, dbi: Connection) -> Dict[str, str] XrefUORM.source_id == SourceUORM.source_id, SourceUORM.name.like(dbname) ) for row in dbi.execute(query).mappings().all(): - if not row.description: - no_descriptions += 1 - else: + if row.description: label_to_desc[row.label] = row.description + else: + no_descriptions += 1 if no_descriptions: - logging.warn(f"Descriptions not defined for {no_descriptions} labels") + logging.warning(f"Descriptions not defined for {no_descriptions} labels") return label_to_desc - def get_synonyms(self, dbname: str, dbi: Connection) -> Dict[str, str]: - synonyms = {} - - # Connect synonyms with xref labels - query = select(SynonymORM.synonym, XrefUORM.label).where( - XrefUORM.xref_id == SynonymORM.xref_id, - SourceUORM.source_id == XrefUORM.source_id, - SourceUORM.name.like(dbname), - ) - for row in dbi.execute(query).mappings().all(): - synonyms[row.synonym] = row.label - - return synonyms - def get_dbname_to_source_id(self, dbname: str, dbi: Connection) -> Dict[str, int]: dbname_to_source_id = {} + # List of source names to look for sources_list = [ "RFAM_trans_name", "miRBase_trans_name", "EntrezGene_trans_name", + f"{dbname}_trans_name", ] - sources_list.append(f"{dbname}_trans_name") - sources_list.append(dbname) source_error = 0 for source_name in sources_list: source_id = dbi.execute( - select(SourceUORM.source_id).where(SourceUORM.name.like(source_name)) + select(SourceUORM.source_id).where(SourceUORM.name == source_name) ).scalar() if not source_id: - logging.warn(f"Could not find external database '{source_name}'") + logging.warning(f"Could not find external database '{source_name}'") source_error += 1 else: dbname_to_source_id[source_name] = source_id @@ -271,6 +256,23 @@ def get_dbname_to_source_id(self, dbname: str, dbi: Connection) -> Dict[str, int return dbname_to_source_id + def delete_old_data(self, source_ids_to_delete: List[int], dbi: Connection) -> None: + # Delete from synonym + query = delete(SynonymORM).where(SynonymORM.xref_id == XrefUORM.xref_id, XrefUORM.source_id.in_(source_ids_to_delete)) + dbi.execute(query) + + # Delete from identity_xref + query = delete(IdentityXrefUORM).where(IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, ObjectXrefUORM.xref_id == XrefUORM.xref_id, XrefUORM.source_id.in_(source_ids_to_delete)) + dbi.execute(query) + + # Delete from object_xref + query = delete(ObjectXrefUORM).where(ObjectXrefUORM.xref_id == XrefUORM.xref_id, 
XrefUORM.source_id.in_(source_ids_to_delete)) + dbi.execute(query) + + # Delete from xref + query = delete(XrefUORM).where(XrefUORM.source_id.in_(source_ids_to_delete)) + dbi.execute(query) + def reset_display_xrefs(self, dbi: Connection) -> None: dbi.execute(update(TranscriptStableIdORM).values(display_xref_id=None)) @@ -311,12 +313,12 @@ def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tup if row.priority < best_level: display_names.clear() - display_names[row.xref_id] = 1 + display_names[row.xref_id] = True best_level = row.priority elif row.priority == best_level: - display_names[row.xref_id] = 1 + display_names[row.xref_id] = True - # Check if the best names has been found, and remove the others if so + # Check if the best name has been found, and remove the others if so if name_count > 1 and len(display_names) == 1: if verbose: logging.info( @@ -336,13 +338,13 @@ def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tup # Perfect case, one best name found if len(display_names) == 1: - xref_id = display_names.keys()[0] + xref_id = next(iter(display_names)) return xref_id_to_display[xref_id], xref_id - # Try to find the best names out of multiple ones + # Try to find the best name out of multiple ones if len(display_names) > 1: temp_best_identity = 0 - best_ids, best_list = [], [] + best_ids = {} # Fail xrefs with worse % identity if we can (query or target identity whichever is greater) case_stmt = case( @@ -371,18 +373,15 @@ def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tup for row in dbi.execute(query).mappings().all(): if row.best_identity > temp_best_identity: best_ids.clear() - best_ids[row.xref_id] = 1 + best_ids[row.xref_id] = True temp_best_identity = row.best_identity elif row.best_identity == temp_best_identity: - best_ids[row.xref_id] = 1 + best_ids[row.xref_id] = True else: break - for xref_id in display_names.keys(): - best_list[xref_id_to_display[xref_id]] = 1 - # Check if we were able to reduce the number of xrefs based on % identity - if len(best_ids) > 0 and len(best_ids) < len(display_names): + if 0 < len(best_ids) < len(display_names): display_names = best_ids if verbose: logging.info( @@ -401,15 +400,12 @@ def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tup return gene_symbol, gene_symbol_xref_id # Take the name which hasn't been already assigned to another gene, if possible - xref_not_used = None - for xref_id in display_names.keys(): - if not official_name_used.get(xref_id): - xref_not_used = xref_id + xref_not_used = next((xref_id for xref_id in display_names if not official_name_used.get(xref_id)), None) if xref_not_used: if verbose: logging.info(f"For gene {gene_id_to_stable_id[gene_id]}:") - for xref_id in display_names.keys(): + for xref_id in display_names: if xref_id == xref_not_used: if verbose: logging.info(f"\t{xref_id_to_display[xref_id]} chosen") @@ -421,9 +417,8 @@ def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tup f"\t{xref_id_to_display[xref_id]} (left as {dbname} reference but not gene symbol)" ) else: - index = 0 - for xref_id in display_names.keys(): - if not index: + for index, xref_id in enumerate(display_names): + if index == 0: if verbose: logging.info( f"\t{xref_id_to_display[xref_id]} chosen as first" @@ -435,11 +430,10 @@ def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tup logging.info( f"\t{xref_id_to_display[xref_id]} (left as {dbname} reference but not gene symbol)" ) 
- index += 1 return gene_symbol, gene_symbol_xref_id - def set_the_best_display_name(self, display_names: Dict[int, int], xref_list: List[int], object_xref_list: List[int], xref_id_to_display: Dict[int, str], verbose: bool, dbi: Connection) -> Tuple[str, int]: + def set_the_best_display_name(self, display_names: Dict[int, bool], xref_list: List[int], object_xref_list: List[int], xref_id_to_display: Dict[int, str], verbose: bool, dbi: Connection) -> Tuple[str, int]: gene_symbol, gene_symbol_xref_id = None, None for xref_id in xref_list: @@ -462,7 +456,7 @@ def find_lrg_hgnc(self, gene_id: int, dbi: Connection) -> Tuple[str, int, bool]: gene_symbol, gene_symbol_xref_id = None, None is_lrg = False - # Look for LRG_HGNC_notransfer, if found then find HGNC equiv and set to this + # Look for LRG_HGNC_notransfer, if found then find HGNC equivalent and set to this query = select( XrefUORM.label, XrefUORM.xref_id, @@ -476,11 +470,12 @@ def find_lrg_hgnc(self, gene_id: int, dbi: Connection) -> Tuple[str, int, bool]: ObjectXrefUORM.ensembl_object_type == "Gene", ) for row in dbi.execute(query).mappings().all(): - # Set status to NO_DISPLAY as we do not want this transferred, just the equivalent hgnc + # Set status to NO_DISPLAY as we do not want this transferred, just the equivalent HGNC self.update_object_xref_status(row.object_xref_id, "NO_DISPLAY") - new_xref_id, priority = None, None - query = ( + # Find the equivalent HGNC xref + new_xref_id = None + result = dbi.execute( select(XrefUORM.xref_id, SourceUORM.priority) .where( XrefUORM.xref_id == ObjectXrefUORM.xref_id, @@ -490,10 +485,9 @@ def find_lrg_hgnc(self, gene_id: int, dbi: Connection) -> Tuple[str, int, bool]: ObjectXrefUORM.ox_status == "DUMP_OUT", ) .order_by(SourceUORM.priority) - ) - result = dbi.execute(query).fetchall() + ).fetchall() if result: - new_xref_id, priority = result[0] + new_xref_id = result[0][0] if new_xref_id: gene_symbol = row.label @@ -502,7 +496,7 @@ def find_lrg_hgnc(self, gene_id: int, dbi: Connection) -> Tuple[str, int, bool]: return gene_symbol, gene_symbol_xref_id, is_lrg - def find_from_other_sources(self, ignore: Dict[int, int], args: Dict[str, Any], dbi: Connection) -> Tuple[str, int, str, Dict[str, str]]: + def find_from_other_sources(self, ignore: Dict[int, bool], args: Dict[str, Any], dbi: Connection) -> Tuple[str, int, str, Dict[str, str]]: gene_id = args["gene_id"] display_label_to_desc = args["display_label_to_desc"] transcript_source = args["transcript_source"] @@ -510,6 +504,7 @@ def find_from_other_sources(self, ignore: Dict[int, int], args: Dict[str, Any], gene_symbol, gene_symbol_xref_id = None, None other_name_number, found_gene = {}, {} + # Iterate through the list of databases to find gene symbols for dbname in ["miRBase", "RFAM", "EntrezGene"]: query = select( XrefUORM.label, @@ -549,7 +544,7 @@ def find_from_other_sources(self, ignore: Dict[int, int], args: Dict[str, Any], return gene_symbol, gene_symbol_xref_id, transcript_source, display_label_to_desc - def set_transcript_display_xrefs(self, args: Dict[str, Any], dbi: Connection) -> Tuple[int, int, Dict[str, int], Dict[str, int]]: + def set_transcript_display_xrefs(self, args: Dict[str, Any], xref_added: Dict[str, int], seen_gene: Dict[str, int], dbi: Connection) -> Tuple[int, int]: max_xref_id = args["max_xref_id"] max_object_xref_id = args["max_object_xref_id"] gene_id = args["gene_id"] @@ -557,21 +552,17 @@ def set_transcript_display_xrefs(self, args: Dict[str, Any], dbi: Connection) -> gene_symbol = args["gene_symbol"] description = 
args["description"] source_id = args["source_id"] - xref_added = args["xref_added"] - seen_gene = args["seen_gene"] transcript_ids = args["transcript_ids"] transcript_source = args["transcript_source"] species_id = args["species_id"] - # Do nothing is LRG + # Do nothing if LRG if re.search("LRG", gene_id_to_stable_id.get(gene_id)): - return + return max_xref_id, max_object_xref_id - ext = 201 - if seen_gene.get(gene_symbol): - ext = seen_gene[gene_symbol] + ext = seen_gene.get(gene_symbol, 201) - # Go thourgh transcripts + # Go through transcripts for transcript_id in transcript_ids: transcript_name = f"{gene_symbol}-{ext}" @@ -581,7 +572,7 @@ def set_transcript_display_xrefs(self, args: Dict[str, Any], dbi: Connection) -> ) index = f"{transcript_name}:{source_id}" - if not xref_added.get(index): + if index not in xref_added: # Add new xref for the transcript name max_xref_id += 1 dbi.execute( @@ -634,4 +625,4 @@ def set_transcript_display_xrefs(self, args: Dict[str, Any], dbi: Connection) -> seen_gene[gene_symbol] = ext - return max_xref_id, max_object_xref_id, xref_added, seen_gene + return max_xref_id, max_object_xref_id diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessMappings.py b/src/python/ensembl/production/xrefs/mappers/ProcessMappings.py index 53832520c..dccdd08f8 100644 --- a/src/python/ensembl/production/xrefs/mappers/ProcessMappings.py +++ b/src/python/ensembl/production/xrefs/mappers/ProcessMappings.py @@ -14,8 +14,24 @@ """Mapper module for processing sequence matched xref data.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import os +import re +import logging +from sqlalchemy import select, insert, update, func +from sqlalchemy.engine import Connection + +from ensembl.xrefs.xref_update_db_model import ( + TranscriptStableId as TranscriptStableIdORM, + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + Mapping as MappingORM, + MappingJobs as MappingJobsORM, +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class ProcessMappings(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -51,9 +67,7 @@ def process_mappings(self) -> None: MappingJobsORM.job_id, ) for mapping_job in xref_dbi.execute(mapping_query).mappings().all(): - root_dir = mapping_job.root_dir - if root_dir is None: - root_dir = "" + root_dir = mapping_job.root_dir or "" err_file = os.path.join(root_dir, mapping_job.err_file) out_file = os.path.join(root_dir, mapping_job.out_file) @@ -63,61 +77,62 @@ def process_mappings(self) -> None: if mapping_job.status == "SUCCESS": already_processed_count += 1 - else: - if os.path.exists(err_file) and os.path.getsize(err_file) > 0: - error_count += 1 + continue - # Display errors on STDERR - logging.warning(f"Problem {err_file} is non zero") - try: - with open(err_file) as fh: - for line in fh: - logging.warning(f"#{line}") - except: - logging.debug( - f"No error file exists {err_file}???\n Resubmit this job" - ) + if os.path.exists(err_file) and os.path.getsize(err_file) > 0: + error_count += 1 - if mapping_job.status == "SUBMITTED": - update_status = "FAILED" - else: - # Process the mapping file - if os.path.exists(map_file): - count = self.process_map_file( - map_file, - query_cutoff[mapping_job.job_id], - target_cutoff[mapping_job.job_id], - mapping_job.job_id, - mapping_job.array_number, - xref_dbi, - ) - if count > 0: - processed_count += 1 - update_status = "SUCCESS" - elif count == 0: - 
processed_count += 1 - empty_count += 1 - update_status = "SUCCESS" - else: - error_count += 1 - update_status = "FAILED" + # Display errors on STDERR + logging.warning(f"Problem {err_file} is non zero") + try: + with open(err_file) as fh: + for line in fh: + logging.warning(f"#{line}") + except Exception as e: + logging.debug( + f"No error file exists {err_file}???\n Resubmit this job. Error: {e}" + ) + + if mapping_job.status == "SUBMITTED": + update_status = "FAILED" + else: + # Process the mapping file + if os.path.exists(map_file): + count = self.process_map_file( + map_file, + query_cutoff[mapping_job.job_id], + target_cutoff[mapping_job.job_id], + mapping_job.job_id, + mapping_job.array_number, + xref_dbi, + ) + if count > 0: + processed_count += 1 + update_status = "SUCCESS" + elif count == 0: + processed_count += 1 + empty_count += 1 + update_status = "SUCCESS" else: error_count += 1 - logging.debug( - f"Could not open map file {map_file}???\n Resubmit this job" - ) update_status = "FAILED" - - # Update mapping job status - if update_status: - xref_dbi.execute( - update(MappingJobsORM) - .where( - MappingJobsORM.job_id == mapping_job.job_id, - MappingJobsORM.array_number == mapping_job.array_number, - ) - .values(status=update_status) + else: + error_count += 1 + logging.debug( + f"Could not open map file {map_file}???\n Resubmit this job" + ) + update_status = "FAILED" + + # Update mapping job status + if update_status: + xref_dbi.execute( + update(MappingJobsORM) + .where( + MappingJobsORM.job_id == mapping_job.job_id, + MappingJobsORM.array_number == mapping_job.array_number, ) + .values(status=update_status) + ) logging.info( f"Already processed = {already_processed_count}, processed = {processed_count}, errors = {error_count}, empty = {empty_count}" @@ -136,13 +151,12 @@ def process_map_file(self, map_file: str, query_cutoff: int, target_cutoff: int, # Get max object xref id object_xref_id = dbi.execute( select(func.max(ObjectXrefUORM.object_xref_id)) - ).scalar() - if not object_xref_id: - object_xref_id = 0 + ).scalar() or 0 total_lines, last_query_id = 0, 0 - best_match_found, best_identity, best_score = 0, 0, 0 - first = 1 + best_match_found = False + best_identity, best_score = 0, 0 + first = True mRNA_biotypes = { "protein_coding": 1, @@ -153,210 +167,208 @@ def process_map_file(self, map_file: str, query_cutoff: int, target_cutoff: int, } try: - mh = open(map_file) - except: - logging.debug(f"Could not open map file {map_file}\n Resubmit this job") - return -1 + with open(map_file) as mh: + for line in mh: + load_object_xref = False + total_lines += 1 - for line in mh: - load_object_xref = 0 - total_lines += 1 - - ( - label, - query_id, - target_id, - identity, - query_length, - target_length, - query_start, - query_end, - target_start, - target_end, - cigar_line, - score, - ) = line.strip().split(":") - - # Fix varibale types (for integer comparisons) - identity = int(identity) - score = int(score) - query_length = int(query_length) - target_length = int(target_length) - query_start = int(query_start) - target_start = int(target_start) - - if last_query_id != query_id: - best_match_found = 0 - best_score = 0 - best_identity = 0 - else: - # Ignore mappings with worse identity or score if we already found a good mapping - if ( - identity < best_identity or score < best_score - ) and best_match_found: - continue - - if ensembl_type == "Translation": - load_object_xref = 1 - else: - # Check if source name is RefSeq_ncRNA or RefSeq_mRNA - # If yes check biotype, if ok 
store object xref - source_name = dbi.execute( - select(SourceUORM.name) - .join(XrefUORM, XrefUORM.source_id == SourceUORM.source_id) - .where(XrefUORM.xref_id == query_id) - ).scalar() - - if source_name and ( - re.search(r"^RefSeq_(m|nc)RNA", source_name) - or re.search(r"^miRBase", source_name) - or re.search(r"^RFAM", source_name) - ): - # Make sure mRNA xrefs are matched to protein_coding biotype only - biotype = dbi.execute( - select(TranscriptStableIdORM.biotype).where( - TranscriptStableIdORM.internal_id == target_id - ) - ).scalar() - - if re.search(r"^RefSeq_mRNA", source_name) and mRNA_biotypes.get( - biotype - ): - load_object_xref = 1 - if re.search( - r"^RefSeq_ncRNA", source_name - ) and not mRNA_biotypes.get(biotype): - load_object_xref = 1 - if ( - re.search(r"^miRBase", source_name) - or re.search(r"^RFAM", source_name) - ) and re.search("RNA", biotype): - load_object_xref = 1 - else: - load_object_xref = 1 + ( + label, + query_id, + target_id, + identity, + query_length, + target_length, + query_start, + query_end, + target_start, + target_end, + cigar_line, + score, + ) = line.strip().split(":") + + # Fix variable types (for integer comparisons) + identity = int(identity) + score = int(score) + query_length = int(query_length) + target_length = int(target_length) + query_start = int(query_start) + target_start = int(target_start) + + if last_query_id != query_id: + best_match_found = False + best_score = 0 + best_identity = 0 + else: + # Ignore mappings with worse identity or score if we already found a good mapping + if ( + (identity < best_identity or score < best_score) + and best_match_found + ): + continue + + if ensembl_type == "Translation": + load_object_xref = True + else: + # Check if source name is RefSeq_ncRNA or RefSeq_mRNA + # If yes check biotype, if ok store object xref + source_name = dbi.execute( + select(SourceUORM.name) + .join(XrefUORM, XrefUORM.source_id == SourceUORM.source_id) + .where(XrefUORM.xref_id == query_id) + ).scalar() + + if source_name and ( + re.search(r"^RefSeq_(m|nc)RNA", source_name) + or re.search(r"^miRBase", source_name) + or re.search(r"^RFAM", source_name) + ): + # Make sure mRNA xrefs are matched to protein_coding biotype only + biotype = dbi.execute( + select(TranscriptStableIdORM.biotype).where( + TranscriptStableIdORM.internal_id == target_id + ) + ).scalar() + + if re.search(r"^RefSeq_mRNA", source_name) and mRNA_biotypes.get( + biotype + ): + load_object_xref = True + if re.search( + r"^RefSeq_ncRNA", source_name + ) and not mRNA_biotypes.get(biotype): + load_object_xref = True + if ( + re.search(r"^miRBase", source_name) + or re.search(r"^RFAM", source_name) + ) and re.search("RNA", biotype): + load_object_xref = True + else: + load_object_xref = True - last_query_id = query_id + last_query_id = query_id - # Check if found a better match - if score > best_score or identity > best_identity: - best_score = score - best_identity = identity + # Check if found a better match + if score > best_score or identity > best_identity: + best_score = score + best_identity = identity - if not load_object_xref: - continue - else: - best_match_found = 1 + if not load_object_xref: + continue + else: + best_match_found = True - if not score: - self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) - raise ValueError(f"No score on line. Possible file corruption\n{line}") + if not score: + self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) + raise ValueError(f"No score on line. 
Possible file corruption\n{line}") - # Calculate percentage identities - query_identity = int(100 * identity / query_length) - target_identity = int(100 * identity / target_length) + # Calculate percentage identities + query_identity = int(100 * identity / query_length) + target_identity = int(100 * identity / target_length) - # Only keep alignments where both sequences match cutoff - status = "DUMP_OUT" - if query_identity < query_cutoff or target_identity < target_cutoff: - status = "FAILED_CUTOFF" + # Only keep alignments where both sequences match cutoff + status = "DUMP_OUT" + if query_identity < query_cutoff or target_identity < target_cutoff: + status = "FAILED_CUTOFF" - # Add object xref row - object_xref_id = self.get_object_xref_id( - target_id, query_id, ensembl_type, "SEQUENCE_MATCH", dbi, None, status - ) - if object_xref_id: - continue - else: - try: - object_xref_id = self.add_object_xref( - target_id, - query_id, - ensembl_type, - "SEQUENCE_MATCH", - dbi, - None, - status, + # Add object xref row + object_xref_id = self.get_object_xref_id( + target_id, query_id, ensembl_type, "SEQUENCE_MATCH", dbi, None, status ) - except: - self.update_object_xref_end( - job_id, array_number, object_xref_id, dbi - ) - raise IOError(f"Problem adding object_xref row") - - if first: - self.update_object_xref_start(job_id, array_number, object_xref_id, dbi) - first = 0 - - cigar_line = re.sub(" ", "", cigar_line) - cigar_line = re.sub(r"([MDI])(\d+)", r"\2\1", cigar_line) - - # Add identity xref row - try: - identity_xref_query = insert(IdentityXrefUORM).values( - object_xref_id=object_xref_id, - query_identity=query_identity, - target_identity=target_identity, - hit_start=query_start + 1, - hit_end=query_end, - translation_start=target_start + 1, - translation_end=target_end, - cigar_line=cigar_line, - score=score, - ) - dbi.execute(identity_xref_query) - except: - self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) - raise IOError(f"Problem loading identity_xref") - - master_xref_ids = [query_id] - for master_xref_id in master_xref_ids: - # Get all dependents related to master xref - dep_query = select(DependentXrefUORM.dependent_xref_id).where( - DependentXrefUORM.master_xref_id == master_xref_id - ) - for dep in dbi.execute(dep_query).mappings().all(): - # Add dependent object xref - dep_object_xref_id = self.get_object_xref_id( - target_id, - dep.dependent_xref_id, - ensembl_type, - "DEPENDENT", - dbi, - master_xref_id, - status, - ) - if dep_object_xref_id: + if object_xref_id: continue else: try: - dep_object_xref_id = self.add_object_xref( + object_xref_id = self.add_object_xref( target_id, - dep.dependent_xref_id, + query_id, ensembl_type, - "DEPENDENT", + "SEQUENCE_MATCH", dbi, - master_xref_id, + None, status, ) except: self.update_object_xref_end( job_id, array_number, object_xref_id, dbi ) - raise IOError(f"Problem adding dependent object xref row") + raise IOError(f"Problem adding object_xref row") - # Add dependent identity xref - dbi.execute( - insert(IdentityXrefUORM).values( - object_xref_id=dep_object_xref_id, + if first: + self.update_object_xref_start(job_id, array_number, object_xref_id, dbi) + first = False + + cigar_line = re.sub(" ", "", cigar_line) + cigar_line = re.sub(r"([MDI])(\d+)", r"\2\1", cigar_line) + + # Add identity xref row + try: + identity_xref_query = insert(IdentityXrefUORM).values( + object_xref_id=object_xref_id, query_identity=query_identity, target_identity=target_identity, + hit_start=query_start + 1, + hit_end=query_end, + 
translation_start=target_start + 1, + translation_end=target_end, + cigar_line=cigar_line, + score=score, ) - ) - - # Get the dependent dependents just in case - master_xref_ids.append(dep.dependent_xref_id) + dbi.execute(identity_xref_query) + except: + self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) + raise IOError(f"Problem loading identity_xref") + + master_xref_ids = [query_id] + for master_xref_id in master_xref_ids: + # Get all dependents related to master xref + dep_query = select(DependentXrefUORM.dependent_xref_id).where( + DependentXrefUORM.master_xref_id == master_xref_id + ) + for dep in dbi.execute(dep_query).mappings().all(): + # Add dependent object xref + dep_object_xref_id = self.get_object_xref_id( + target_id, + dep.dependent_xref_id, + ensembl_type, + "DEPENDENT", + dbi, + master_xref_id, + status, + ) + if dep_object_xref_id: + continue + else: + try: + dep_object_xref_id = self.add_object_xref( + target_id, + dep.dependent_xref_id, + ensembl_type, + "DEPENDENT", + dbi, + master_xref_id, + status, + ) + except: + self.update_object_xref_end( + job_id, array_number, object_xref_id, dbi + ) + raise IOError(f"Problem adding dependent object xref row") + + # Add dependent identity xref + dbi.execute( + insert(IdentityXrefUORM).values( + object_xref_id=dep_object_xref_id, + query_identity=query_identity, + target_identity=target_identity, + ) + ) - mh.close() + # Get the dependent dependents just in case + master_xref_ids.append(dep.dependent_xref_id) + except Exception as e: + logging.debug(f"Could not open map file {map_file}\n Resubmit this job. Error: {e}") + return -1 self.update_object_xref_end(job_id, array_number, object_xref_id, dbi) return total_lines diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessMoves.py b/src/python/ensembl/production/xrefs/mappers/ProcessMoves.py index c086cab01..d4085640e 100644 --- a/src/python/ensembl/production/xrefs/mappers/ProcessMoves.py +++ b/src/python/ensembl/production/xrefs/mappers/ProcessMoves.py @@ -14,8 +14,30 @@ """Mapper module for moving xref data onto appriopriate genes.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +from typing import List, Tuple, Dict +from sqlalchemy import select, func, update, delete, insert +from sqlalchemy.engine import Connection + +from ensembl.xrefs.xref_update_db_model import ( + GeneTranscriptTranslation as GeneTranscriptTranslationORM, + GeneStableId as GeneStableIdORM, + TranscriptStableId as TranscriptStableIdORM, + TranslationStableId as TranslationStableIdORM, + ObjectXref as ObjectXrefUORM, + AltAllele as AltAlleleUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + GeneDirectXref as GeneDirectXrefORM, + TranscriptDirectXref as TranscriptDirectXrefORM, + TranslationDirectXref as TranslationDirectXrefORM, + Synonym as SynonymORM, + PrimaryXref as PrimaryXrefORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class ProcessMoves(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -28,11 +50,11 @@ def biomart_testing(self, verbose: bool) -> None: xref_dbi = self.xref().connect() - again = 1 + again = True while again: - again = 0 + again = False - last_type, last_count, last_name = None, None, "DEFAULT" + last_type, last_name = None, "DEFAULT" query = ( select( @@ -48,18 +70,15 @@ def biomart_testing(self, verbose: bool) -> None: .group_by(SourceUORM.name, ObjectXrefUORM.ensembl_object_type) ) for row in 
xref_dbi.execute(query).mappings().all(): - if again: - break - if last_name == row.name: - again = 1 + again = True self.biomart_fix( row.name, last_type, row.ensembl_object_type, xref_dbi ) + break last_name = row.name last_type = row.ensembl_object_type - last_count = row.count if self.unlinked_entries(verbose, xref_dbi): raise ValueError("Problems found before source_defined_move") @@ -70,60 +89,41 @@ def biomart_testing(self, verbose: bool) -> None: def unlinked_entries(self, verbose: bool, dbi: Connection) -> bool: failed = False - xref_id, count = None, None self.update_process_status("tests_started") + def log_problems(count, description, query): + nonlocal failed + if count: + failed = True + logging.error(f"Problem with {count} {description}s") + if verbose: + for row in dbi.execute(query).mappings().all(): + logging.error(f"Problem with {description} {row.log_xref_id}") + # Get count of unlinked master xrefs count = dbi.execute( select(func.count(DependentXrefUORM.master_xref_id)) .outerjoin(XrefUORM, XrefUORM.xref_id == DependentXrefUORM.master_xref_id) .where(XrefUORM.xref_id == None) ).scalar() - - if count: - failed = True - logging.error(f"Problem with {count} master xrefs") - - if verbose: - query = ( - select(DependentXrefUORM.master_xref_id.distinct()) - .outerjoin( - XrefUORM, XrefUORM.xref_id == DependentXrefUORM.master_xref_id - ) - .where(XrefUORM.xref_id == None) - .limit(10) - ) - for row in dbi.execute(query).mappings().all(): - logging.error(f"Problem with master xref {row.master_xref_id}") + log_problems(count, "master xref", + select(DependentXrefUORM.master_xref_id.distinct().label("log_xref_id")) + .outerjoin(XrefUORM, XrefUORM.xref_id == DependentXrefUORM.master_xref_id) + .where(XrefUORM.xref_id == None) + .limit(10)) # Get count of unlinked dependent xrefs count = dbi.execute( select(func.count(DependentXrefUORM.dependent_xref_id)) - .outerjoin( - XrefUORM, XrefUORM.xref_id == DependentXrefUORM.dependent_xref_id - ) + .outerjoin(XrefUORM, XrefUORM.xref_id == DependentXrefUORM.dependent_xref_id) .where(XrefUORM.xref_id == None) ).scalar() - - if count: - failed = True - logging.error(f"Problem with {count} dependent xrefs") - - if verbose: - query = ( - select(DependentXrefUORM.dependent_xref_id.distinct()) - .outerjoin( - XrefUORM, - XrefUORM.xref_id == DependentXrefUORM.dependent_xref_id, - ) - .where(XrefUORM.xref_id == None) - .limit(10) - ) - for row in dbi.execute(query).mappings().all(): - logging.error( - f"Problem with dependent xref {row.dependent_xref_id}" - ) + log_problems(count, "dependent xref", + select(DependentXrefUORM.dependent_xref_id.distinct().label("log_xref_id")) + .outerjoin(XrefUORM, XrefUORM.xref_id == DependentXrefUORM.dependent_xref_id) + .where(XrefUORM.xref_id == None) + .limit(10)) # Get count of unlinked primary xrefs count = dbi.execute( @@ -131,59 +131,31 @@ def unlinked_entries(self, verbose: bool, dbi: Connection) -> bool: .outerjoin(XrefUORM, XrefUORM.xref_id == PrimaryXrefORM.xref_id) .where(XrefUORM.xref_id == None) ).scalar() - - if count: - failed = True - logging.error(f"Problem with {count} primary xrefs") - - if verbose: - query = ( - select(PrimaryXrefORM.xref_id.distinct()) - .outerjoin(XrefUORM, XrefUORM.xref_id == PrimaryXrefORM.xref_id) - .where(XrefUORM.xref_id == None) - .limit(10) - ) - for row in dbi.execute(query).mappings().all(): - logging.error(f"Problem with primary xref {row.xref_id}") + log_problems(count, "primary xref", + select(PrimaryXrefORM.xref_id.distinct().label("log_xref_id")) + 
.outerjoin(XrefUORM, XrefUORM.xref_id == PrimaryXrefORM.xref_id) + .where(XrefUORM.xref_id == None) + .limit(10)) db_tables = { + "transcript": {"direct": TranscriptDirectXrefORM, "stable_id": TranscriptStableIdORM}, + "translation": {"direct": TranslationDirectXrefORM, "stable_id": TranslationStableIdORM}, "gene": {"direct": GeneDirectXrefORM, "stable_id": GeneStableIdORM}, - "transcript": { - "direct": TranscriptDirectXrefORM, - "stable_id": TranscriptStableIdORM, - }, - "translation": { - "direct": TranslationDirectXrefORM, - "stable_id": TranslationStableIdORM, - }, } # Get count of unlinked direct xrefs - for object_type in ["transcript", "translation", "gene"]: - direct_table = db_tables[object_type]["direct"] + for object_type, tables in db_tables.items(): + direct_table = tables["direct"] count = dbi.execute( select(func.count(direct_table.general_xref_id)) .outerjoin(XrefUORM, XrefUORM.xref_id == direct_table.general_xref_id) .where(XrefUORM.xref_id == None) ).scalar() - - if count: - failed = True - logging.error(f"Problem with {count} {object_type} direct xrefs") - - if verbose: - query = ( - select(direct_table.general_xref_id.distinct()) - .outerjoin( - XrefUORM, XrefUORM.xref_id == direct_table.general_xref_id - ) - .where(XrefUORM.xref_id == None) - .limit(10) - ) - for row in dbi.execute(query).mappings().all(): - logging.error( - f"Problem with {object_type} direct xref {row.general_xref_id}" - ) + log_problems(count, f"{object_type} direct xref", + select(direct_table.general_xref_id.distinct().label("log_xref_id")) + .outerjoin(XrefUORM, XrefUORM.xref_id == direct_table.general_xref_id) + .where(XrefUORM.xref_id == None) + .limit(10)) # Get count of unlinked synonyms count = dbi.execute( @@ -191,280 +163,219 @@ def unlinked_entries(self, verbose: bool, dbi: Connection) -> bool: .outerjoin(XrefUORM, XrefUORM.xref_id == SynonymORM.xref_id) .where(XrefUORM.xref_id == None) ).scalar() - - if count: - failed = True - logging.error(f"Problem with {count} synonyms") - - if verbose: - query = ( - select(SynonymORM.xref_id.distinct()) - .outerjoin(XrefUORM, XrefUORM.xref_id == SynonymORM.xref_id) - .where(XrefUORM.xref_id == None) - .limit(10) - ) - for row in dbi.execute(query).mappings().all(): - logging.error(f"Problem with synonym {row.xref_id}") + log_problems(count, "synonym", + select(SynonymORM.xref_id.distinct().label("log_xref_id")) + .outerjoin(XrefUORM, XrefUORM.xref_id == SynonymORM.xref_id) + .where(XrefUORM.xref_id == None) + .limit(10)) # Get count of unlinked identity object xrefs count = dbi.execute( select(func.count(IdentityXrefUORM.object_xref_id)) - .outerjoin( - ObjectXrefUORM, - ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id, - ) + .outerjoin(ObjectXrefUORM, ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id) .where(ObjectXrefUORM.object_xref_id == None) ).scalar() - - if count: - failed = True - logging.error(f"Problem with {count} object xrefs") - - if verbose: - query = ( - select(IdentityXrefUORM.object_xref_id.distinct()) - .outerjoin( - ObjectXrefUORM, - ObjectXrefUORM.object_xref_id - == IdentityXrefUORM.object_xref_id, - ) - .where(ObjectXrefUORM.object_xref_id == None) - .limit(10) - ) - for row in dbi.execute(query).mappings().all(): - logging.error(f"Problem with object xref {row.object_xref_id}") + log_problems(count, "object xref", + select(IdentityXrefUORM.object_xref_id.distinct().label("log_xref_id")) + .outerjoin(ObjectXrefUORM, ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id) + 
.where(ObjectXrefUORM.object_xref_id == None) + .limit(10)) # Get count of unlinked objects - for object_type in ["transcript", "translation", "gene"]: + for object_type, tables in db_tables.items(): id_column = getattr(GeneTranscriptTranslationORM, f"{object_type}_id") - stable_id_table = db_tables[object_type]["stable_id"] + stable_id_table = tables["stable_id"] count = dbi.execute( select(func.count(id_column)) .outerjoin(stable_id_table, stable_id_table.internal_id == id_column) .where(stable_id_table.internal_id == None, id_column != None) ).scalar() + log_problems(count, f"{object_type}_ids", + select(id_column.label("object_id").distinct()) + .outerjoin(stable_id_table, stable_id_table.internal_id == id_column) + .where(stable_id_table.internal_id == None, id_column != None) + .limit(10)) - if count: - failed = True - logging.error(f"Problem with {count} {object_type}_ids") - - if verbose: - query = ( - select(id_column.label("object_id").distinct()) - .outerjoin( - stable_id_table, stable_id_table.internal_id == id_column - ) - .where(stable_id_table.internal_id == None, id_column != None) - .limit(10) - ) - for row in dbi.execute(query).mappings().all(): - logging.error(f"Problem with {object_type}_id {row.object_id}") - - if not failed: - self.update_process_status("tests_finished") - else: - self.update_process_status("tests_failed") + self.update_process_status("tests_finished" if not failed else "tests_failed") return failed def source_defined_move(self, verbose: bool) -> None: - xref_dbi = self.xref().connect() + logging.info("Starting source defined move") - for source in self.get_gene_specific_list(xref_dbi): - self.biomart_fix(source, "Translation", "Gene", xref_dbi) - self.biomart_fix(source, "Transcript", "Gene", xref_dbi) + with self.xref().connect() as xref_dbi: + for source in self.get_gene_specific_list(xref_dbi): + logging.info(f"Processing source: {source}") + self.biomart_fix(source, "Translation", "Gene", xref_dbi) + self.biomart_fix(source, "Transcript", "Gene", xref_dbi) - if self.unlinked_entries(verbose, xref_dbi): - raise ValueError("Problems found after source_defined_move") - - xref_dbi.close() + if self.unlinked_entries(verbose, xref_dbi): + raise ValueError("Problems found after source_defined_move") self.update_process_status("source_level_move_finished") + logging.info("Source defined move finished") def get_gene_specific_list(self, dbi: Connection) -> List[str]: sources_list = [ - "DBASS3", - "DBASS5", - "EntrezGene", - "miRBase", - "RFAM", - "TRNASCAN_SE", - "RNAMMER", - "UniGene", - "Uniprot_gn", - "WikiGene", - "MIM_GENE", - "MIM_MORBID", - "HGNC", - "MGI", - "ZFIN_ID", - "FlyBaseName_gene", - "RGD", - "SGD_GENE", - "VGNC", - "wormbase_gseqname", - "wormbase_locus", - "Xenbase", - "GeneCards", + "DBASS3", "DBASS5", "EntrezGene", "miRBase", "RFAM", "TRNASCAN_SE", + "RNAMMER", "UniGene", "Uniprot_gn", "WikiGene", "MIM_GENE", "MIM_MORBID", + "HGNC", "MGI", "ZFIN_ID", "FlyBaseName_gene", "RGD", "SGD_GENE", "VGNC", + "wormbase_gseqname", "wormbase_locus", "Xenbase", "GeneCards", ] - used_list = [] - count = None - # Check that the sources are used in the database considered - for source in sources_list: - count = dbi.execute( + used_list = [ + source for source in sources_list + if dbi.execute( select(func.count(XrefUORM.xref_id)).where( XrefUORM.source_id == SourceUORM.source_id, SourceUORM.name == source, ) - ).scalar() - - if count > 0: - used_list.append(source) + ).scalar() > 0 + ] return used_list def process_alt_alleles(self, verbose: bool) -> 
None: logging.info("Processing alt alleles") - xref_dbi = self.xref().connect() + with self.xref().connect() as xref_dbi: + alt_to_ref, ref_to_alts = self.get_alt_allele_hashes(xref_dbi) + gene_specific_list = self.get_gene_specific_list(xref_dbi) - alt_to_ref, ref_to_alts = self.get_alt_allele_hashes(xref_dbi) - gene_specific_list = self.get_gene_specific_list(xref_dbi) + move_count, del_identity_xref_count, del_object_xref_count = 0, 0, 0 - move_count, del_identity_xref_count, del_object_xref_count = 0, 0, 0 + for gene_id, ref_gene in alt_to_ref.items(): + # Move the xrefs onto the reference Gene + query = ( + update(ObjectXrefUORM) + .where( + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.ensembl_id == gene_id, + ObjectXrefUORM.ensembl_object_type == "Gene", + ObjectXrefUORM.ox_status == "DUMP_OUT", + SourceUORM.name.in_(gene_specific_list), + ) + .values(ensembl_id=ref_gene) + .prefix_with("IGNORE") + ) + row_count = xref_dbi.execute(query).rowcount + move_count += row_count - for gene_id, ref_gene in alt_to_ref.items(): - # Move the xrefs onto the reference Gene - query = ( - update(ObjectXrefUORM) - .where( + # Delete the related identity and object xrefs + query = delete(IdentityXrefUORM).where( XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id, ObjectXrefUORM.xref_id == XrefUORM.xref_id, ObjectXrefUORM.ensembl_id == gene_id, ObjectXrefUORM.ensembl_object_type == "Gene", ObjectXrefUORM.ox_status == "DUMP_OUT", SourceUORM.name.in_(gene_specific_list), ) - .values(ensembl_id=ref_gene) - .prefix_with("IGNORE") - ) - row_count = xref_dbi.execute(query).rowcount - move_count += row_count - - # Delete the related identity and object xrefs - query = delete(IdentityXrefUORM).where( - XrefUORM.source_id == SourceUORM.source_id, - ObjectXrefUORM.object_xref_id == IdentityXrefUORM.object_xref_id, - ObjectXrefUORM.xref_id == XrefUORM.xref_id, - ObjectXrefUORM.ensembl_id == gene_id, - ObjectXrefUORM.ensembl_object_type == "Gene", - ObjectXrefUORM.ox_status == "DUMP_OUT", - SourceUORM.name.in_(gene_specific_list), - ) - row_count = xref_dbi.execute(query).rowcount - del_identity_xref_count += row_count - - query = delete(ObjectXrefUORM).where( - XrefUORM.source_id == SourceUORM.source_id, - ObjectXrefUORM.xref_id == XrefUORM.xref_id, - ObjectXrefUORM.ensembl_id == gene_id, - ObjectXrefUORM.ensembl_object_type == "Gene", - ObjectXrefUORM.ox_status == "DUMP_OUT", - SourceUORM.name.in_(gene_specific_list), - ) - row_count = xref_dbi.execute(query).rowcount - del_object_xref_count += row_count - - logging.info( - f"Number of rows: moved = {move_count}, identity_xrefs deleted = {del_identity_xref_count}, object_xrefs deleted = {del_object_xref_count}" - ) - - max_object_xref_id = xref_dbi.execute( - select(func.max(ObjectXrefUORM.object_xref_id)) - ).scalar() - max_object_xref_id = int(max_object_xref_id) - - if not max_object_xref_id: - raise LookupError("Problem getting max object_xref_id") + row_count = xref_dbi.execute(query).rowcount + del_identity_xref_count += row_count - added_count, ignored = 0, 0 - - # Copy the xref data related to the reference gene onto the alt alleles - for ref_gene, alts in ref_to_alts.items(): - # Get object and identity xref data related to the reference gene - query = ( - select(ObjectXrefUORM, IdentityXrefUORM) - .outerjoin( - IdentityXrefUORM, - IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, - ) - .where( + query = 
delete(ObjectXrefUORM).where( XrefUORM.source_id == SourceUORM.source_id, ObjectXrefUORM.xref_id == XrefUORM.xref_id, - ObjectXrefUORM.ensembl_id == ref_gene, - ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_id == gene_id, ObjectXrefUORM.ensembl_object_type == "Gene", + ObjectXrefUORM.ox_status == "DUMP_OUT", SourceUORM.name.in_(gene_specific_list), ) + row_count = xref_dbi.execute(query).rowcount + del_object_xref_count += row_count + + logging.info( + f"Number of rows: moved = {move_count}, identity_xrefs deleted = {del_identity_xref_count}, object_xrefs deleted = {del_object_xref_count}" ) - for row in xref_dbi.execute(query).mappings().all(): - for alt in alts: - max_object_xref_id += 1 - - query = insert(ObjectXrefUORM).values( - object_xref_id=max_object_xref_id, - ensembl_id=alt, - ensembl_object_type=row.ensembl_object_type, - xref_id=row.xref_id, - linkage_annotation=row.linkage_annotation, - linkage_type=row.linkage_type, - ox_status=row.ox_status, - unused_priority=row.unused_priority, - master_xref_id=row.master_xref_id, - ) - row_count = xref_dbi.execute(query).rowcount - # Only add identity xref if object_xref was added successfully - if row_count: - added_count += 1 + max_object_xref_id = xref_dbi.execute( + select(func.max(ObjectXrefUORM.object_xref_id)) + ).scalar() + max_object_xref_id = int(max_object_xref_id) - query = insert(IdentityXrefUORM).values( - object_xref_id=max_object_xref_id, - query_identity=row.query_identity, - target_identity=row.target_identity, - hit_start=row.hit_start, - hit_end=row.hit_end, - translation_start=row.translation_start, - translation_end=row.translation_end, - cigar_line=row.cigar_line, - score=row.score, - evalue=row.evalue, - ) - xref_dbi.execute(query) - else: - ignored += 1 + if not max_object_xref_id: + raise LookupError("Problem getting max object_xref_id") - logging.info(f"Added {added_count} new mappings and ignored {ignored}") + added_count, ignored = 0, 0 - if self.unlinked_entries(verbose, xref_dbi): - raise ValueError("Problems found after process_alt_alleles") + # Copy the xref data related to the reference gene onto the alt alleles + for ref_gene, alts in ref_to_alts.items(): + # Get object and identity xref data related to the reference gene + query = ( + select(ObjectXrefUORM, IdentityXrefUORM) + .outerjoin( + IdentityXrefUORM, + IdentityXrefUORM.object_xref_id == ObjectXrefUORM.object_xref_id, + ) + .where( + XrefUORM.source_id == SourceUORM.source_id, + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.ensembl_id == ref_gene, + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.ensembl_object_type == "Gene", + SourceUORM.name.in_(gene_specific_list), + ) + ) + for row in xref_dbi.execute(query).mappings().all(): + for alt in alts: + max_object_xref_id += 1 - xref_dbi.close() + query = insert(ObjectXrefUORM).values( + object_xref_id=max_object_xref_id, + ensembl_id=alt, + ensembl_object_type=row.ensembl_object_type, + xref_id=row.xref_id, + linkage_annotation=row.linkage_annotation, + linkage_type=row.linkage_type, + ox_status=row.ox_status, + unused_priority=row.unused_priority, + master_xref_id=row.master_xref_id, + ) + row_count = xref_dbi.execute(query).rowcount + + # Only add identity xref if object_xref was added successfully + if row_count: + added_count += 1 + + query = insert(IdentityXrefUORM).values( + object_xref_id=max_object_xref_id, + query_identity=row.query_identity, + target_identity=row.target_identity, + hit_start=row.hit_start, + hit_end=row.hit_end, + 
translation_start=row.translation_start, + translation_end=row.translation_end, + cigar_line=row.cigar_line, + score=row.score, + evalue=row.evalue, + ) + xref_dbi.execute(query) + else: + ignored += 1 + + logging.info(f"Added {added_count} new mappings and ignored {ignored}") + + if self.unlinked_entries(verbose, xref_dbi): + raise ValueError("Problems found after process_alt_alleles") self.update_process_status("alt_alleles_processed") def get_alt_allele_hashes(self, dbi: Connection) -> Tuple[Dict[int, int], Dict[int, List[int]]]: - alt_to_ref, ref_to_alts = {}, {} - last_alt_allele, ref_gene = 0, None + alt_to_ref = {} + ref_to_alts = {} + last_alt_allele = None + ref_gene = None query = select( AltAlleleUORM.alt_allele_id, AltAlleleUORM.gene_id, AltAlleleUORM.is_reference, ).order_by(AltAlleleUORM.alt_allele_id, AltAlleleUORM.is_reference.desc()) + for row in dbi.execute(query).mappings().all(): if row.alt_allele_id != last_alt_allele: # Use the first non-reference gene if there is no reference gene in an alt_allele diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessPaired.py b/src/python/ensembl/production/xrefs/mappers/ProcessPaired.py index 0dcbfdff4..3f30fcb7b 100644 --- a/src/python/ensembl/production/xrefs/mappers/ProcessPaired.py +++ b/src/python/ensembl/production/xrefs/mappers/ProcessPaired.py @@ -14,8 +14,22 @@ """Mapper module for processing paired xrefs.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +from sqlalchemy import select, update, insert +from sqlalchemy.orm import aliased +from sqlalchemy.engine import Connection + +from ensembl.xrefs.xref_update_db_model import ( + GeneTranscriptTranslation as GeneTranscriptTranslationORM, + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + Pairs as PairsORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class ProcessPaired(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -28,7 +42,6 @@ def process(self) -> None: xref_dbi = self.xref().connect() - object_xref_id = None change = { "translation object xrefs added": 0, "translation object xrefs removed": 0, @@ -67,6 +80,7 @@ def process(self) -> None: ) for row in xref_dbi.execute(query).mappings().all(): # Check if translation is linked to the paired RefSeq peptide + transl_object_xref_id = None if row.translation_id: query = ( select(ObjectXrefUORM.object_xref_id, ObjectXrefUORM.xref_id) @@ -83,8 +97,6 @@ def process(self) -> None: if result.rowcount > 0: object_xref_row = result.mappings().all()[0] transl_object_xref_id = object_xref_row.object_xref_id - else: - transl_object_xref_id = None # If it's already linked we don't have to do anything if not transl_object_xref_id: @@ -167,10 +179,7 @@ def process(self) -> None: ) for row in xref_dbi.execute(query).mappings().all(): if RefSeq_pep_translation.get(row.accession): - found = 0 - for tr_id in RefSeq_pep_translation[row.accession]: - if tr_id == row.ensembl_id: - found = 1 + found = any(tr_id == row.ensembl_id for tr_id in RefSeq_pep_translation[row.accession]) if not found: # This translations's transcript is not matched with the paired RefSeq_mRNA%, @@ -194,23 +203,18 @@ def process(self) -> None: self.update_process_status("processed_pairs") def process_dependents(self, translation_object_xref_id: int, translation_id: int, transcript_id: int, dbi: Connection) -> None: - master_object_xrefs = [] - new_master_object_xref_id = None - 
master_object_xref_ids = {} - - master_object_xrefs.append(translation_object_xref_id) - master_object_xref_ids[translation_object_xref_id] = 1 + master_object_xrefs = [translation_object_xref_id] + master_object_xref_ids = set(master_object_xrefs) while master_object_xrefs: master_object_xref_id = master_object_xrefs.pop() - dependent_object_xref_id = None MasterObjectXref = aliased(ObjectXrefUORM) DependentObjectXref = aliased(ObjectXrefUORM) - MasterXref = aliased(XrefUORM) DependentXref = aliased(XrefUORM) + # Process dependent xrefs for Translation query = select(DependentObjectXref.object_xref_id.distinct()).where( DependentXref.xref_id == DependentXrefUORM.dependent_xref_id, MasterXref.xref_id == DependentXrefUORM.master_xref_id, @@ -225,10 +229,11 @@ def process_dependents(self, translation_object_xref_id: int, translation_id: in for row in dbi.execute(query).mappings().all(): self.update_object_xref_status(row.object_xref_id, "MULTI_DELETE", dbi) - if not master_object_xref_ids.get(row.object_xref_id): - master_object_xref_ids[row.object_xref_id] = 1 + if row.object_xref_id not in master_object_xref_ids: + master_object_xref_ids.add(row.object_xref_id) master_object_xrefs.append(row.object_xref_id) + # Process dependent xrefs for Transcript query = select(DependentObjectXref.object_xref_id.distinct()).where( DependentXref.xref_id == DependentXrefUORM.dependent_xref_id, MasterXref.xref_id == DependentXrefUORM.master_xref_id, @@ -243,6 +248,6 @@ def process_dependents(self, translation_object_xref_id: int, translation_id: in for row in dbi.execute(query).mappings().all(): self.update_object_xref_status(row.object_xref_id, "MULTI_DELETE", dbi) - if not master_object_xref_ids.get(row.object_xref_id): - master_object_xref_ids[row.object_xref_id] = 1 + if row.object_xref_id not in master_object_xref_ids: + master_object_xref_ids.add(row.object_xref_id) master_object_xrefs.append(row.object_xref_id) diff --git a/src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py b/src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py index ba212ddf6..8f418d271 100644 --- a/src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py +++ b/src/python/ensembl/production/xrefs/mappers/ProcessPriorities.py @@ -14,8 +14,21 @@ """Mapper module for processing xref priorities.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +from typing import List +from sqlalchemy import select, update, insert, delete, desc +from sqlalchemy.engine import Connection + +from ensembl.xrefs.xref_update_db_model import ( + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + Synonym as SynonymORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class ProcessPriorities(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -51,7 +64,7 @@ def process(self) -> None: # Now ALL object_xrefs have an identity_xref # So we can do a straight join and treat all info_types the same way for name in names: - last_acc, last_name, best_xref_id, last_xref_id, seen = "", "", None, 0, 0 + last_acc, last_name, best_xref_id, last_xref_id, seen = "", "", None, 0, False best_ensembl_id, gone = [], [] query = ( @@ -147,9 +160,7 @@ def process(self) -> None: ) # Copy synonyms across if they are missing - query = select(SynonymORM.synonym).where( - SynonymORM.xref_id == row.xref_id - ) + query = select(SynonymORM.synonym).where(SynonymORM.xref_id == row.xref_id) 
for synonym_row in ( xref_dbi.execute(query).mappings().all() ): @@ -179,12 +190,11 @@ def process(self) -> None: best_ensembl_id.append(row.ensembl_id) # Best priority failed so another one now found so set dumped - if len(gone) > 0: - if last_name == row.accession: - for x_id in gone: - self.update_xref_dumped( - x_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi - ) + if gone and last_name == row.accession: + for x_id in gone: + self.update_xref_dumped( + x_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi + ) else: # New xref_id if row.ox_status == "DUMP_OUT": @@ -192,7 +202,7 @@ def process(self) -> None: best_xref_id = row.xref_id best_ensembl_id = [row.ensembl_id] - if len(gone) > 0 and last_name == row.accession: + if gone and last_name == row.accession: for x_id in gone: self.update_xref_dumped( x_id, "NO_DUMP_ANOTHER_PRIORITY", xref_dbi @@ -226,7 +236,7 @@ def get_priority_names(self, dbi: Connection) -> List[str]: for row in dbi.execute(query).mappings().all(): if row.name == last_name and not seen.get(row.name): names.append(row.name) - seen[row.name] = 1 + seen[row.name] = True last_name = row.name return names @@ -238,7 +248,7 @@ def update_xref_dumped(self, xref_id: int, dumped: str, dbi: Connection) -> None def process_dependents(self, old_master_xref_id: int, new_master_xref_id: int, dbi: Connection) -> None: master_xrefs = [old_master_xref_id] - recursive = 0 + recursive = False # Create a hash of all possible mappings for this accession ensembl_ids = {} @@ -296,7 +306,7 @@ def process_dependents(self, old_master_xref_id: int, new_master_xref_id: int, d for row in dbi.execute(query).mappings().all(): # Remove all mappings to low priority xrefs # Then delete any leftover identity xrefs of it - for ensembl_id in old_ensembl_ids.get(row.ensembl_object_type): + for ensembl_id in old_ensembl_ids.get(row.ensembl_object_type, []): self._detach_object_xref( xref_id, row.dependent_xref_id, @@ -319,7 +329,7 @@ def process_dependents(self, old_master_xref_id: int, new_master_xref_id: int, d ) # Loop through all chosen (best) ensembl ids mapped to priority xref, and connect them with object_xrefs - for ensembl_id in ensembl_ids.get(row.ensembl_object_type): + for ensembl_id in ensembl_ids.get(row.ensembl_object_type, []): # Add new object_xref for each best_ensembl_id dbi.execute( insert(ObjectXrefUORM) @@ -357,7 +367,7 @@ def process_dependents(self, old_master_xref_id: int, new_master_xref_id: int, d if row.dependent_xref_id != xref_id: master_xrefs.append(row.dependent_xref_id) - recursive = 1 + recursive = True def _detach_object_xref(self, xref_id: int, dependent_xref_id: int, object_type: str, ensembl_id: int, dbi: Connection) -> None: # Drop all the identity and go xrefs for the dependents of an xref diff --git a/src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py b/src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py index 473af5a69..6d5f682ae 100644 --- a/src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/RNACentralMapper.py @@ -14,8 +14,7 @@ """Mapper module for processing RNACentral xref data.""" -from ensembl.production.xrefs.mappers.ChecksumMapper import * - +from ensembl.production.xrefs.mappers.ChecksumMapper import ChecksumMapper class RNACentralMapper(ChecksumMapper): def target(self) -> str: diff --git a/src/python/ensembl/production/xrefs/mappers/TestMappings.py b/src/python/ensembl/production/xrefs/mappers/TestMappings.py index 4511741d1..d3d0a319a 100644 --- 
a/src/python/ensembl/production/xrefs/mappers/TestMappings.py +++ b/src/python/ensembl/production/xrefs/mappers/TestMappings.py @@ -14,8 +14,30 @@ """Mapper module for running validity checks on xref data.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +from sqlalchemy import select, func, text + +from ensembl.core.models import ( + Gene as GeneORM, + ObjectXref as ObjectXrefCORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM +) + +from ensembl.xrefs.xref_update_db_model import ( + GeneStableId as GeneStableIdORM, + TranscriptStableId as TranscriptStableIdORM, + TranslationStableId as TranslationStableIdORM, + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM, + GeneDirectXref as GeneDirectXrefORM, + TranscriptDirectXref as TranscriptDirectXrefORM, + TranslationDirectXref as TranslationDirectXrefORM, + Synonym as SynonymORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class TestMappings(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -40,10 +62,10 @@ def direct_stable_id_check(self) -> int: total_warnings_count = 0 - for object_type in ["gene", "transcript", "translation"]: + for object_type, tables in db_tables.items(): warnings_count = 0 - direct_table = db_tables[object_type]["direct"] - stable_id_table = db_tables[object_type]["stable_id"] + direct_table = tables["direct"] + stable_id_table = tables["stable_id"] query = ( select(SourceUORM.name, func.count(XrefUORM.xref_id).label("count")) @@ -57,7 +79,7 @@ def direct_stable_id_check(self) -> int: .group_by(SourceUORM.name) ) for row in xref_dbi.execute(query).mappings().all(): - logging.warn( + logging.warning( f"{row.name} has {row.count} invalid stable IDs in {object_type}_direct_xref" ) warnings_count += 1 @@ -78,8 +100,15 @@ def xrefs_counts_check(self) -> int: core_count, xref_count = {}, {} # TO DO: sqlalchemy syntax -- can't figure out how to count 2 columns - xref_query = f'SELECT s.name, COUNT(DISTINCT x.xref_id, ox.ensembl_id) AS count FROM xref x, object_xref ox, source s WHERE ox.xref_id = x.xref_id AND x.source_id = s.source_id AND ox_status = "DUMP_OUT" GROUP BY s.name' - for row in xref_dbi.execute(text(xref_query)).mappings().all(): + xref_query = text( + 'SELECT s.name, COUNT(DISTINCT x.xref_id, ox.ensembl_id) AS count ' + 'FROM xref x ' + 'JOIN object_xref ox ON ox.xref_id = x.xref_id ' + 'JOIN source s ON x.source_id = s.source_id ' + 'WHERE ox_status = "DUMP_OUT" ' + 'GROUP BY s.name' + ) + for row in xref_dbi.execute(xref_query).mappings().all(): xref_count[row.name] = row.count query = ( @@ -87,10 +116,8 @@ def xrefs_counts_check(self) -> int: ExternalDbORM.db_name, func.count(ObjectXrefCORM.object_xref_id).label("count"), ) - .where( - XrefCORM.xref_id == ObjectXrefCORM.xref_id, - XrefCORM.external_db_id == ExternalDbORM.external_db_id, - ) + .join(XrefCORM, XrefCORM.xref_id == ObjectXrefCORM.xref_id) + .join(ExternalDbORM, XrefCORM.external_db_id == ExternalDbORM.external_db_id) .filter((XrefCORM.info_type == None) | (XrefCORM.info_type != "PROJECTION")) .group_by(ExternalDbORM.db_name) ) @@ -102,24 +129,24 @@ def xrefs_counts_check(self) -> int: change = ((xref_count[row.db_name] - row.count) / row.count) * 100 if change > 5: - logging.warn( - f"{row.db_name} has increased by {change}%. It was {row.count} in the core DB, while it is {xref_count[row.db_name]} in the xref DB" + logging.warning( + f"{row.db_name} has increased by {change:.2f}%. 
It was {row.count} in the core DB, while it is {xref_count[row.db_name]} in the xref DB" ) warnings_count += 1 elif change < -5: - logging.warn( - f"{row.db_name} has decreased by {change}%. It was {row.count} in the core DB, while it is {xref_count[row.db_name]} in the xref DB" + logging.warning( + f"{row.db_name} has decreased by {change:.2f}%. It was {row.count} in the core DB, while it is {xref_count[row.db_name]} in the xref DB" ) warnings_count += 1 else: - logging.warn( + logging.warning( f"{row.db_name} xrefs are not in the xref DB but {row.count} are in the core DB" ) warnings_count += 1 for name, count in xref_count.items(): if not core_count.get(name): - logging.warn( + logging.warning( f"{name} has {count} xrefs in the xref DB but none in the core DB" ) warnings_count += 1 @@ -141,53 +168,52 @@ def name_change_check(self, official_name: str = None) -> int: xref_dbi = self.xref().connect() core_dbi = self.core().connect() - query = select( - XrefUORM.label, GeneStableIdORM.internal_id, GeneStableIdORM.stable_id - ).where( - XrefUORM.xref_id == ObjectXrefUORM.object_xref_id, - ObjectXrefUORM.ensembl_object_type == "Gene", - GeneStableIdORM.internal_id == ObjectXrefUORM.ensembl_id, - XrefUORM.source_id == SourceUORM.source_id, - SourceUORM.name.like(f"{official_name}_%"), + # Query to get new names and stable IDs + query = ( + select(XrefUORM.label, GeneStableIdORM.internal_id, GeneStableIdORM.stable_id) + .join(ObjectXrefUORM, XrefUORM.xref_id == ObjectXrefUORM.object_xref_id) + .join(GeneStableIdORM, GeneStableIdORM.internal_id == ObjectXrefUORM.ensembl_id) + .join(SourceUORM, XrefUORM.source_id == SourceUORM.source_id) + .where( + ObjectXrefUORM.ensembl_object_type == "Gene", + SourceUORM.name.like(f"{official_name}_%") + ) ) for row in xref_dbi.execute(query).mappings().all(): new_name[row.internal_id] = row.label id_to_stable_id[row.internal_id] = row.stable_id + # Query to get aliases query = ( select(XrefUORM.label, SynonymORM.synonym) + .join(SynonymORM, XrefUORM.xref_id == SynonymORM.xref_id) + .join(SourceUORM, XrefUORM.source_id == SourceUORM.source_id) .where( - XrefUORM.xref_id == SynonymORM.xref_id, - XrefUORM.source_id == SourceUORM.source_id, - ) - .filter( - (SourceUORM.name.like(f"{official_name}_%")) - | (SourceUORM.name.like("EntrezGene")) + (SourceUORM.name.like(f"{official_name}_%")) | (SourceUORM.name.like("EntrezGene")) ) ) for row in xref_dbi.execute(query).mappings().all(): alias[row.synonym] = row.label - query = select(XrefCORM.display_label, GeneORM.gene_id).where( - XrefCORM.xref_id == GeneORM.display_xref_id, - GeneORM.biotype == "protein_coding", + # Query to get current display labels + query = ( + select(XrefCORM.display_label, GeneORM.gene_id) + .join(GeneORM, XrefCORM.xref_id == GeneORM.display_xref_id) + .where(GeneORM.biotype == "protein_coding") ) for row in core_dbi.execute(query).mappings().all(): if new_name.get(row.gene_id): total_count += 1 if new_name.get(row.gene_id) and new_name[row.gene_id] != row.display_label: - if ( - not alias.get(row.display_label) - or alias.get(row.display_label) != new_name[row.gene_id] - ): - logging.warn( + if not alias.get(row.display_label) or alias.get(row.display_label) != new_name[row.gene_id]: + logging.warning( f"gene ID ({row.gene_id}) {id_to_stable_id[row.gene_id]} new = {new_name[row.gene_id]} old = {row.display_label}" ) warnings_count += 1 if total_count: - logging.warn( + logging.warning( f"{warnings_count} entries with different names out of {total_count} protein coding gene comparisons" ) 
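Note (illustrative, not part of the patch): the XrefLoader.update() changes further down rely on an ID-offset scheme when copying xrefs from the intermediate xref DB into the core DB -- xref_offset and object_xref_offset are set to the current maximum xref_id / object_xref_id in core (or 0 for an empty table), and every loaded row uses intermediate_id + offset so new rows never collide with existing core rows. A minimal, hypothetical Python sketch of that arithmetic; the function and variable names below are illustrative and are not part of the module's API:

# Illustrative only -- mirrors the offset arithmetic applied in XrefLoader.update()
# (offset = max existing core ID or 0; loaded rows use intermediate_id + offset).
from typing import Dict, List

def shift_ids(intermediate_ids: List[int], max_core_id: int) -> Dict[int, int]:
    """Map intermediate DB IDs onto core DB IDs using an offset."""
    offset = max_core_id or 0  # empty core table -> offset of 0
    return {i: i + offset for i in intermediate_ids}

# Example: core DB already holds xref_ids up to 500000, intermediate DB holds 1..3
# shift_ids([1, 2, 3], 500000) -> {1: 500001, 2: 500002, 3: 500003}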
diff --git a/src/python/ensembl/production/xrefs/mappers/UniParcMapper.py b/src/python/ensembl/production/xrefs/mappers/UniParcMapper.py index f518303bb..2b8a97b29 100644 --- a/src/python/ensembl/production/xrefs/mappers/UniParcMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/UniParcMapper.py @@ -14,8 +14,7 @@ """Mapper module for processing UniParc xref data.""" -from ensembl.production.xrefs.mappers.ChecksumMapper import * - +from ensembl.production.xrefs.mappers.ChecksumMapper import ChecksumMapper class UniParcMapper(ChecksumMapper): def target(self) -> str: diff --git a/src/python/ensembl/production/xrefs/mappers/XrefLoader.py b/src/python/ensembl/production/xrefs/mappers/XrefLoader.py index be634d870..c95ee7716 100644 --- a/src/python/ensembl/production/xrefs/mappers/XrefLoader.py +++ b/src/python/ensembl/production/xrefs/mappers/XrefLoader.py @@ -14,8 +14,40 @@ """Mapper module for loading xref data into the core DB.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +import re +from datetime import datetime +from sqlalchemy import select, func, update, delete +from sqlalchemy.dialects.mysql import insert +from sqlalchemy.orm import sessionmaker, aliased, Session +from sqlalchemy.engine import Connection +from sqlalchemy.exc import SQLAlchemyError +from typing import Any, Dict + +from ensembl.core.models import ( + Gene as GeneORM, + ObjectXref as ObjectXrefCORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + UnmappedObject as UnmappedObjectORM, + Analysis as AnalysisORM, + OntologyXref as OntologyXrefORM, + ExternalSynonym as ExternalSynonymORM, + DependentXref as DependentXrefCORM, + IdentityXref as IdentityXrefCORM +) + +from ensembl.xrefs.xref_update_db_model import ( + ObjectXref as ObjectXrefUORM, + Source as SourceUORM, + Xref as XrefUORM, + IdentityXref as IdentityXrefUORM, + DependentXref as DependentXrefUORM, + Synonym as SynonymORM, + PrimaryXref as PrimaryXrefORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class XrefLoader(BasicMapper): def __init__(self, mapper: BasicMapper) -> None: @@ -33,7 +65,8 @@ def update(self, species_name: str) -> None: self.delete_projection_data(core_dbi) # Get the source IDs of relevant external DBs - name_to_external_db_id, source_id_to_external_db_id = {}, {} + name_to_external_db_id = {} + source_id_to_external_db_id = {} query = select(ExternalDbORM.external_db_id, ExternalDbORM.db_name) for row in core_dbi.execute(query).mappings().all(): @@ -46,15 +79,11 @@ def update(self, species_name: str) -> None: ) for row in xref_dbi.execute(query).mappings().all(): if name_to_external_db_id.get(row.name): - source_id_to_external_db_id[row.source_id] = name_to_external_db_id[ - row.name - ] + source_id_to_external_db_id[row.source_id] = name_to_external_db_id[row.name] elif re.search(r"notransfer$", row.name): continue else: - raise LookupError( - f"Could not find {row.name} in external_db table in the core DB" - ) + raise LookupError(f"Could not find {row.name} in external_db table in the core DB") # Reset dumped field in case module is running again xref_dbi.execute( @@ -64,29 +93,16 @@ def update(self, species_name: str) -> None: ) # Delete existing xrefs in core DB (only from relevant sources) - self.deleted_existing_xrefs(name_to_external_db_id, xref_dbi, core_dbi) + self.deleted_existing_xrefs(name_to_external_db_id, xref_dbi) # Get the offsets for xref and object_xref tables - # This is used to track the xrefs whe mapping onto the core DB - xref_offset = 
core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() - object_xref_offset = core_dbi.execute( - select(func.max(ObjectXrefCORM.object_xref_id)) - ).scalar() + xref_offset = core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() or 0 + object_xref_offset = core_dbi.execute(select(func.max(ObjectXrefCORM.object_xref_id))).scalar() or 0 - if not xref_offset: - xref_offset = 0 - else: - xref_offset = int(xref_offset) self.add_meta_pair("xref_offset", xref_offset) - if not object_xref_offset: - object_xref_offset = 0 - else: - object_xref_offset = int(object_xref_offset) self.add_meta_pair("object_xref_offset", object_xref_offset) - logging.info( - f"DB offsets: xref={xref_offset}, object_xref={object_xref_offset}" - ) + logging.info(f"DB offsets: xref={xref_offset}, object_xref={object_xref_offset}") # Get analysis IDs analysis_ids = self.get_analysis(core_dbi) @@ -110,8 +126,6 @@ def update(self, species_name: str) -> None: .order_by(XrefUORM.xref_id) ) - #### TO DO: transaction - # Get source info from xref DB query = ( select( @@ -133,9 +147,7 @@ def update(self, species_name: str) -> None: # We only care about specific sources if not name_to_external_db_id.get(source_row.name): continue - logging.info( - f"Updating source '{source_row.name}' ({source_row.source_id}) in core" - ) + logging.info(f"Updating source '{source_row.name}' ({source_row.source_id}) in core") where_from = source_row.priority_description if where_from: @@ -144,319 +156,294 @@ def update(self, species_name: str) -> None: external_id = name_to_external_db_id[source_row.name] xref_list = [] - if ( - source_row.info_type == "DIRECT" - or source_row.info_type == "INFERRED_PAIR" - or source_row.info_type == "MISC" - ): - count, last_xref_id = 0, 0 - - # Get all direct, inferred pair and misc xrefs from intermediate DB - query = xref_object_identity_query.where( - XrefUORM.source_id == source_row.source_id, - XrefUORM.info_type == source_row.info_type, - ) - for xref_row in xref_dbi.execute(query).mappings().all(): - xref_id = int(xref_row.xref_id) - object_xref_id = int(xref_row.object_xref_id) - - if last_xref_id != xref_id: - xref_list.append(xref_id) - count += 1 - - # Add xref into core DB - info_text = xref_row.info_text - if not info_text: - info_text = where_from - xref_args = { - "xref_id": xref_id, - "accession": xref_row.accession, - "external_db_id": external_id, - "label": xref_row.label, - "description": xref_row.description, - "version": xref_row.version, - "info_type": xref_row.info_type, - "info_text": info_text, - } - xref_id = self.add_xref(xref_offset, xref_args, core_dbi) - last_xref_id = xref_id - - # Add object xref into core DB - object_xref_args = { - "object_xref_id": object_xref_id, - "ensembl_id": xref_row.ensembl_id, - "ensembl_type": xref_row.ensembl_object_type, - "xref_id": xref_id + xref_offset, - "analysis_id": analysis_ids[xref_row.ensembl_object_type], - } - object_xref_id = self.add_object_xref( - object_xref_offset, object_xref_args, core_dbi - ) + Session = sessionmaker(bind=self.core().execution_options(isolation_level="READ COMMITTED")) + with Session.begin() as session: + try: + if source_row.info_type in ["DIRECT", "INFERRED_PAIR", "MISC"]: + count, last_xref_id = 0, 0 + + # Get all direct, inferred pair and misc xrefs from intermediate DB + query = xref_object_identity_query.where( + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == source_row.info_type, + ) + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = 
int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + info_text = xref_row.info_text or where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": xref_row.label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, session) + last_xref_id = xref_id + + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + "analysis_id": analysis_ids[xref_row.ensembl_object_type], + } + object_xref_id = self.add_object_xref(object_xref_offset, object_xref_args, session) + + # Add identity xref into core DB + if xref_row.translation_start: + query = ( + insert(IdentityXrefCORM) + .values( + object_xref_id=object_xref_id + object_xref_offset, + xref_identity=xref_row.query_identity, + ensembl_identity=xref_row.target_identity, + xref_start=xref_row.hit_start, + xref_end=xref_row.hit_end, + ensembl_start=xref_row.translation_start, + ensembl_end=xref_row.translation_end, + cigar_line=xref_row.cigar_line, + score=xref_row.score, + evalue=xref_row.evalue, + ) + .prefix_with("IGNORE") + ) + session.execute(query) - # Add identity xref into core DB - if xref_row.translation_start: + logging.info(f"\tLoaded {count} {source_row.info_type} xrefs for '{species_name}'") + elif source_row.info_type == "CHECKSUM": + count, last_xref_id = 0, 0 + + # Get all checksum xrefs from intermediate DB + query = xref_object_query.where( + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == source_row.info_type, + ) + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + info_text = xref_row.info_text or where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": xref_row.label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, session) + last_xref_id = xref_id + + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + "analysis_id": analysis_ids["checksum"], + } + object_xref_id = self.add_object_xref(object_xref_offset, object_xref_args, session) + + logging.info(f"\tLoaded {count} CHECKSUM xrefs for '{species_name}'") + elif source_row.info_type == "DEPENDENT": + count, last_xref_id, last_ensembl_id, master_error_count = 0, 0, 0, 0 + master_problems = [] + + # Get all dependent xrefs from intermediate DB + MasterXref = aliased(XrefUORM) query = ( - insert(IdentityXrefCORM) - .values( - object_xref_id=object_xref_id + object_xref_offset, - xref_identity=xref_row.query_identity, - ensembl_identity=xref_row.target_identity, - xref_start=xref_row.hit_start, - xref_end=xref_row.hit_end, - ensembl_start=xref_row.translation_start, - ensembl_end=xref_row.translation_end, - cigar_line=xref_row.cigar_line, - 
score=xref_row.score, - evalue=xref_row.evalue, + select(XrefUORM, ObjectXrefUORM) + .where( + ObjectXrefUORM.ox_status == "DUMP_OUT", + ObjectXrefUORM.xref_id == XrefUORM.xref_id, + ObjectXrefUORM.master_xref_id == MasterXref.xref_id, + MasterXref.source_id == SourceUORM.source_id, + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == "DEPENDENT", ) - .prefix_with("IGNORE") + .order_by(XrefUORM.xref_id, ObjectXrefUORM.ensembl_id, SourceUORM.ordered) ) - core_dbi.execute(query) - - logging.info( - f"\tLoaded {count} {source_row.info_type} xrefs for '{species_name}'" - ) - elif source_row.info_type == "CHECKSUM": - count, last_xref_id = 0, 0 + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + label = xref_row.label or xref_row.accession + info_text = xref_row.info_text or where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, session) + + if last_xref_id != xref_id or last_ensembl_id != xref_row.ensembl_id: + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + "analysis_id": analysis_ids[xref_row.ensembl_object_type], + } + object_xref_id = self.add_object_xref(object_xref_offset, object_xref_args, session) + + if xref_row.master_xref_id: + # Add dependent xref into core DB + session.execute( + insert(DependentXrefCORM) + .values( + object_xref_id=object_xref_id + object_xref_offset, + master_xref_id=xref_row.master_xref_id + xref_offset, + dependent_xref_id=xref_id + xref_offset, + ) + .prefix_with("IGNORE") + ) + else: + if master_error_count < 10: + master_problems.append(xref_row.accession) + master_error_count += 1 + + last_xref_id = xref_id + last_ensembl_id = xref_row.ensembl_id + + if master_problems: + logging.warning( + f"For {source_row.name}, there were {master_error_count} problem master xrefs. 
Examples are: " + + ", ".join(master_problems) + ) - # Get all checksum xrefs from intermediate DB - query = xref_object_query.where( - XrefUORM.source_id == source_row.source_id, - XrefUORM.info_type == source_row.info_type, - ) - for xref_row in xref_dbi.execute(query).mappings().all(): - xref_id = int(xref_row.xref_id) - object_xref_id = int(xref_row.object_xref_id) - - if last_xref_id != xref_id: - xref_list.append(xref_id) - count += 1 - - # Add xref into core DB - info_text = xref_row.info_text - if not info_text: - info_text = where_from - xref_args = { - "xref_id": xref_id, - "accession": xref_row.accession, - "external_db_id": external_id, - "label": xref_row.label, - "description": xref_row.description, - "version": xref_row.version, - "info_type": xref_row.info_type, - "info_text": info_text, - } - xref_id = self.add_xref(xref_offset, xref_args, core_dbi) - last_xref_id = xref_id - - # Add object xref into core DB - object_xref_args = { - "object_xref_id": object_xref_id, - "ensembl_id": xref_row.ensembl_id, - "ensembl_type": xref_row.ensembl_object_type, - "xref_id": xref_id + xref_offset, - "analysis_id": analysis_ids["checksum"], - } - object_xref_id = self.add_object_xref( - object_xref_offset, object_xref_args, core_dbi - ) + logging.info(f"\tLoaded {count} DEPENDENT xrefs for '{species_name}'") + elif source_row.info_type == "SEQUENCE_MATCH": + count, last_xref_id = 0, 0 - logging.info(f"\tLoaded {count} CHECKSUM xrefs for '{species_name}'") - elif source_row.info_type == "DEPENDENT": - count, last_xref_id, last_ensembl_id, master_error_count = 0, 0, 0, 0 - master_problems = [] - - # Get all dependent xrefs from intermediate DB - MasterXref = aliased(XrefUORM) - query = ( - select(XrefUORM, ObjectXrefUORM) - .where( - ObjectXrefUORM.ox_status == "DUMP_OUT", - ObjectXrefUORM.xref_id == XrefUORM.xref_id, - ObjectXrefUORM.master_xref_id == MasterXref.xref_id, - MasterXref.source_id == SourceUORM.source_id, - XrefUORM.source_id == source_row.source_id, - XrefUORM.info_type == "DEPENDENT", - ) - .order_by( - XrefUORM.xref_id, ObjectXrefUORM.ensembl_id, SourceUORM.ordered - ) - ) - for xref_row in xref_dbi.execute(query).mappings().all(): - xref_id = int(xref_row.xref_id) - object_xref_id = int(xref_row.object_xref_id) - - if last_xref_id != xref_id: - xref_list.append(xref_id) - count += 1 - - # Add xref into core DB - label = xref_row.label - if not label: - label = xref_row.accession - info_text = xref_row.info_text - if not info_text: - info_text = where_from - xref_args = { - "xref_id": xref_id, - "accession": xref_row.accession, - "external_db_id": external_id, - "label": label, - "description": xref_row.description, - "version": xref_row.version, - "info_type": xref_row.info_type, - "info_text": info_text, - } - xref_id = self.add_xref(xref_offset, xref_args, core_dbi) - - if ( - last_xref_id != xref_id - or last_ensembl_id != xref_row.ensembl_id - ): - # Add object xref into core DB - object_xref_args = { - "object_xref_id": object_xref_id, - "ensembl_id": xref_row.ensembl_id, - "ensembl_type": xref_row.ensembl_object_type, - "xref_id": xref_id + xref_offset, - "analysis_id": analysis_ids[xref_row.ensembl_object_type], - } - object_xref_id = self.add_object_xref( - object_xref_offset, object_xref_args, core_dbi + # Get all direct, inferred pair and misc xrefs from intermediate DB + query = xref_object_identity_query.where( + XrefUORM.source_id == source_row.source_id, + XrefUORM.info_type == source_row.info_type, ) - - if xref_row.master_xref_id: - # Add dependent xref 
into core DB - core_dbi.execute( - insert(DependentXrefCORM) + for xref_row in xref_dbi.execute(query).mappings().all(): + xref_id = int(xref_row.xref_id) + object_xref_id = int(xref_row.object_xref_id) + + if last_xref_id != xref_id: + xref_list.append(xref_id) + count += 1 + + # Add xref into core DB + info_text = xref_row.info_text or where_from + xref_args = { + "xref_id": xref_id, + "accession": xref_row.accession, + "external_db_id": external_id, + "label": xref_row.label, + "description": xref_row.description, + "version": xref_row.version, + "info_type": xref_row.info_type, + "info_text": info_text, + } + xref_id = self.add_xref(xref_offset, xref_args, session) + last_xref_id = xref_id + + # Add object xref into core DB + object_xref_args = { + "object_xref_id": object_xref_id, + "ensembl_id": xref_row.ensembl_id, + "ensembl_type": xref_row.ensembl_object_type, + "xref_id": xref_id + xref_offset, + "analysis_id": analysis_ids[xref_row.ensembl_object_type], + } + object_xref_id = self.add_object_xref(object_xref_offset, object_xref_args, session) + + # Add identity xref into core DB + query = ( + insert(IdentityXrefCORM) .values( object_xref_id=object_xref_id + object_xref_offset, - master_xref_id=xref_row.master_xref_id - + xref_offset, - dependent_xref_id=xref_id + xref_offset, + xref_identity=xref_row.query_identity, + ensembl_identity=xref_row.target_identity, + xref_start=xref_row.hit_start, + xref_end=xref_row.hit_end, + ensembl_start=xref_row.translation_start, + ensembl_end=xref_row.translation_end, + cigar_line=xref_row.cigar_line, + score=xref_row.score, + evalue=xref_row.evalue, ) .prefix_with("IGNORE") ) - else: - if master_error_count < 10: - master_problems.append(xref_row.accession) + session.execute(query) - master_error_count += 1 + logging.info(f"\tLoaded {count} SEQUENCE_MATCH xrefs for '{species_name}'") + else: + logging.debug(f"\tPROBLEM: what type is {source_row.info_type}") - last_xref_id = xref_id - last_ensembl_id = xref_row.ensembl_id - - if len(master_problems) > 0: - logging.warn( - f"For {source_row.name}, there were {master_error_count} problem master xrefs. 
Examples are: " - + ", ".join(master_problems) - ) + # Transfer synonym data + if xref_list: + syn_count = 0 - logging.info(f"\tLoaded {count} DEPENDENT xrefs for '{species_name}'") - elif source_row.info_type == "SEQUENCE_MATCH": - count, last_xref_id = 0, 0 - - # Get all direct, inferred pair and misc xrefs from intermediate DB - query = xref_object_identity_query.where( - XrefUORM.source_id == source_row.source_id, - XrefUORM.info_type == source_row.info_type, - ) - for xref_row in xref_dbi.execute(query).mappings().all(): - xref_id = int(xref_row.xref_id) - object_xref_id = int(xref_row.object_xref_id) - - if last_xref_id != xref_id: - xref_list.append(xref_id) - count += 1 - - # Add xref into core DB - info_text = xref_row.info_text - if not info_text: - info_text = where_from - xref_args = { - "xref_id": xref_id, - "accession": xref_row.accession, - "external_db_id": external_id, - "label": xref_row.label, - "description": xref_row.description, - "version": xref_row.version, - "info_type": xref_row.info_type, - "info_text": info_text, - } - xref_id = self.add_xref(xref_offset, xref_args, core_dbi) - last_xref_id = xref_id - - # Add object xref into core DB - object_xref_args = { - "object_xref_id": object_xref_id, - "ensembl_id": xref_row.ensembl_id, - "ensembl_type": xref_row.ensembl_object_type, - "xref_id": xref_id + xref_offset, - "analysis_id": analysis_ids[xref_row.ensembl_object_type], - } - object_xref_id = self.add_object_xref( - object_xref_offset, object_xref_args, core_dbi - ) - - # Add identity xref into core DB - query = ( - insert(IdentityXrefCORM) - .values( - object_xref_id=object_xref_id + object_xref_offset, - xref_identity=xref_row.query_identity, - ensembl_identity=xref_row.target_identity, - xref_start=xref_row.hit_start, - xref_end=xref_row.hit_end, - ensembl_start=xref_row.translation_start, - ensembl_end=xref_row.translation_end, - cigar_line=xref_row.cigar_line, - score=xref_row.score, - evalue=xref_row.evalue, + # Get synonyms + query = select(SynonymORM.xref_id, SynonymORM.synonym).where( + SynonymORM.xref_id.in_(xref_list) ) - .prefix_with("IGNORE") - ) - core_dbi.execute(query) - - logging.info( - f"\tLoaded {count} SEQUENCE_MATCH xrefs for '{species_name}'" - ) - else: - logging.debug(f"\tPROBLEM: what type is {source_row.info_type}") + for syn_row in xref_dbi.execute(query).mappings().all(): + session.execute( + insert(ExternalSynonymORM).values( + xref_id=syn_row.xref_id + xref_offset, + synonym=syn_row.synonym, + ) + ) + syn_count += 1 - # Transfer synonym data - if len(xref_list) > 0: - syn_count = 0 + logging.info(f"\tLoaded {syn_count} synonyms for '{species_name}'") - # Get synonyms - query = select(SynonymORM.xref_id, SynonymORM.synonym).where( - SynonymORM.xref_id.in_(xref_list) - ) - for syn_row in xref_dbi.execute(query).mappings().all(): - core_dbi.execute( - insert(ExternalSynonymORM).values( - xref_id=syn_row.xref_id + xref_offset, - synonym=syn_row.synonym, + # Set dumped status + xref_dbi.execute( + update(XrefUORM) + .values(dumped="MAPPED") + .where(XrefUORM.xref_id.in_(xref_list)) ) - ) - - syn_count += 1 - logging.info(f"\tLoaded {syn_count} synonyms for '{species_name}'") - - # Set dumped status - xref_dbi.execute( - update(XrefUORM) - .values(dumped="MAPPED") - .where(XrefUORM.xref_id.in_(xref_list)) - ) + # Update release info + if source_row.source_release and source_row.source_release != "1": + session.execute( + update(ExternalDbORM) + .values(db_release=source_row.source_release) + .where(ExternalDbORM.external_db_id == 
external_id) + ) - # Update release info - if source_row.source_release and source_row.source_release != "1": - core_dbi.execute( - update(ExternalDbORM) - .values(db_release=source_row.source_release) - .where(ExternalDbORM.external_db_id == external_id) - ) + session.commit() + except SQLAlchemyError as e: + session.rollback() + logging.error(f"Failed to load xrefs for source '{source_row.name}': {e}") + raise RuntimeError(f"Transaction failed for source '{source_row.name}'") # Update the unmapped xrefs self.update_unmapped_xrefs(xref_dbi) @@ -511,7 +498,7 @@ def delete_projection_data(self, dbi: Connection) -> None: f"Deleted all PROJECTIONs rows: {counts['external_synonym']} external_synonyms, {counts['dependent_xref']} dependent_xrefs, {counts['object_xref']} object_xrefs, {counts['xref']} xrefs" ) - def deleted_existing_xrefs(self, name_to_external_db_id: Dict[str, int], xref_dbi: Connection, core_dbi: Connection) -> None: + def deleted_existing_xrefs(self, name_to_external_db_id: Dict[str, int], xref_dbi: Connection) -> None: # For each external_db to be updated, delete the existing xrefs query = ( select(SourceUORM.name, func.count(XrefUORM.xref_id).label("count")) @@ -522,92 +509,111 @@ def deleted_existing_xrefs(self, name_to_external_db_id: Dict[str, int], xref_db .group_by(SourceUORM.name) ) for row in xref_dbi.execute(query).mappings().all(): - if not name_to_external_db_id.get(row.name): + name = row.name + external_db_id = name_to_external_db_id.get(name) + if not external_db_id: continue - name = row.name - external_db_id = name_to_external_db_id[name] - counts = {"master_dependent_xref": 0, "master_object_xref": 0} + counts = { + "gene": 0, + "external_synonym": 0, + "identity_xref": 0, + "object_xref": 0, + "master_dependent_xref": 0, + "master_object_xref": 0, + "dependent_xref": 0, + "xref": 0, + "unmapped_object": 0, + } logging.info(f"For source '{name}'") - counts["gene"] = core_dbi.execute( - update(GeneORM) - .values(display_xref_id=None, description=None) - .where( - GeneORM.display_xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - logging.info( - f"\tSet display_xref_id=NULL and description=NULL for {counts['gene']} gene row(s)" - ) - - counts["external_synonym"] = core_dbi.execute( - delete(ExternalSynonymORM).where( - ExternalSynonymORM.xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - counts["identity_xref"] = core_dbi.execute( - delete(IdentityXrefCORM).where( - IdentityXrefCORM.object_xref_id == ObjectXrefCORM.object_xref_id, - ObjectXrefCORM.xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - counts["object_xref"] = core_dbi.execute( - delete(ObjectXrefCORM).where( - ObjectXrefCORM.xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - - MasterXref = aliased(XrefCORM) - DependentXref = aliased(XrefCORM) - - query = select( - ObjectXrefCORM.object_xref_id, - DependentXrefCORM.master_xref_id, - DependentXrefCORM.dependent_xref_id, - ).where( - ObjectXrefCORM.object_xref_id == DependentXrefCORM.object_xref_id, - MasterXref.xref_id == DependentXrefCORM.master_xref_id, - DependentXref.xref_id == DependentXrefCORM.dependent_xref_id, - MasterXref.external_db_id == external_db_id, - ) - for row in core_dbi.execute(query).mappings().all(): - counts["master_dependent_xref"] += core_dbi.execute( - delete(DependentXrefCORM).where( - DependentXrefCORM.master_xref_id == row.master_xref_id, - 
DependentXrefCORM.dependent_xref_id == row.dependent_xref_id, + Session = sessionmaker(bind=self.core().execution_options(isolation_level="READ COMMITTED")) + with Session.begin() as session: + try: + counts["gene"] = session.execute( + update(GeneORM) + .values(display_xref_id=None, description=None) + .where( + GeneORM.display_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + logging.info( + f"\tSet display_xref_id=NULL and description=NULL for {counts['gene']} gene row(s)" ) - ).rowcount - counts["master_object_xref"] += core_dbi.execute( - delete(ObjectXrefCORM).where( - ObjectXrefCORM.object_xref_id == row.object_xref_id + + counts["external_synonym"] = session.execute( + delete(ExternalSynonymORM).where( + ExternalSynonymORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["identity_xref"] = session.execute( + delete(IdentityXrefCORM).where( + IdentityXrefCORM.object_xref_id == ObjectXrefCORM.object_xref_id, + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["object_xref"] = session.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + + MasterXref = aliased(XrefCORM) + DependentXref = aliased(XrefCORM) + + query = select( + ObjectXrefCORM.object_xref_id, + DependentXrefCORM.master_xref_id, + DependentXrefCORM.dependent_xref_id, + ).where( + ObjectXrefCORM.object_xref_id == DependentXrefCORM.object_xref_id, + MasterXref.xref_id == DependentXrefCORM.master_xref_id, + DependentXref.xref_id == DependentXrefCORM.dependent_xref_id, + MasterXref.external_db_id == external_db_id, ) - ).rowcount + for sub_row in session.execute(query).mappings().all(): + counts["master_dependent_xref"] += session.execute( + delete(DependentXrefCORM).where( + DependentXrefCORM.master_xref_id == sub_row.master_xref_id, + DependentXrefCORM.dependent_xref_id == sub_row.dependent_xref_id, + ) + ).rowcount + counts["master_object_xref"] += session.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.object_xref_id == sub_row.object_xref_id + ) + ).rowcount - counts["dependent_xref"] = core_dbi.execute( - delete(DependentXrefCORM).where( - DependentXrefCORM.dependent_xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - counts["xref"] = core_dbi.execute( - delete(XrefCORM).where(XrefCORM.external_db_id == external_db_id) - ).rowcount - counts["unmapped_object"] = core_dbi.execute( - delete(UnmappedObjectORM).where( - UnmappedObjectORM.unmapped_object_type == "xref", - UnmappedObjectORM.external_db_id == external_db_id, - ) - ).rowcount + counts["dependent_xref"] = session.execute( + delete(DependentXrefCORM).where( + DependentXrefCORM.dependent_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["xref"] = session.execute( + delete(XrefCORM).where(XrefCORM.external_db_id == external_db_id) + ).rowcount + counts["unmapped_object"] = session.execute( + delete(UnmappedObjectORM).where( + UnmappedObjectORM.unmapped_object_type == "xref", + UnmappedObjectORM.external_db_id == external_db_id, + ) + ).rowcount - logging.info( - f"\tDeleted rows: {counts['external_synonym']} external_synonyms, {counts['identity_xref']} identity_xrefs, {counts['object_xref']} object_xrefs, {counts['master_dependent_xref']} master dependent_xrefs, {counts['master_object_xref']} master object_xrefs, 
{counts['dependent_xref']} dependent_xrefs, {counts['xref']} xrefs, {counts['unmapped_object']} unmapped_objects" - ) + logging.info( + f"\tDeleted rows: {counts['external_synonym']} external_synonyms, {counts['identity_xref']} identity_xrefs, {counts['object_xref']} object_xrefs, {counts['master_dependent_xref']} master dependent_xrefs, {counts['master_object_xref']} master object_xrefs, {counts['dependent_xref']} dependent_xrefs, {counts['xref']} xrefs, {counts['unmapped_object']} unmapped_objects" + ) + + session.commit() + except SQLAlchemyError as e: + session.rollback() + logging.error(f"Failed to delete rows for source '{name}': {e}") + raise RuntimeError(f"Transaction failed for source '{name}'") def get_analysis(self, dbi: Connection) -> Dict[str, int]: analysis_ids = {} @@ -617,19 +623,21 @@ def get_analysis(self, dbi: Connection) -> Dict[str, int]: "Translation": "xrefexonerateprotein", } - for object_type in ["Gene", "Transcript", "Translation"]: - logic_name = type_to_logic_name[object_type] + for object_type, logic_name in type_to_logic_name.items(): analysis_ids[object_type] = self.get_single_analysis(logic_name, dbi) + # Add checksum analysis ID analysis_ids["checksum"] = self.get_single_analysis("xrefchecksum", dbi) return analysis_ids def get_single_analysis(self, logic_name: str, dbi: Connection) -> int: + # Retrieve the analysis ID for the given logic name analysis_id = dbi.execute( select(AnalysisORM.analysis_id).where(AnalysisORM.logic_name == logic_name) ).scalar() + # If the analysis ID does not exist, create a new analysis entry if not analysis_id: Session = sessionmaker(self.core()) with Session.begin() as session: @@ -641,7 +649,7 @@ def get_single_analysis(self, logic_name: str, dbi: Connection) -> int: return analysis_id - def add_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> int: + def add_xref(self, offset: int, args: Dict[str, Any], session: Session) -> int: xref_id = args["xref_id"] accession = args["accession"] external_db_id = args["external_db_id"] @@ -651,7 +659,8 @@ def add_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> int: info_type = args["info_type"] info_text = args["info_text"] - new_xref_id = dbi.execute( + # Check if the xref already exists + new_xref_id = session.execute( select(XrefCORM.xref_id).where( XrefCORM.dbprimary_acc == accession, XrefCORM.external_db_id == external_db_id, @@ -661,8 +670,9 @@ def add_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> int: ) ).scalar() + # If it doesn't exist, insert it if not new_xref_id: - dbi.execute( + session.execute( insert(XrefCORM).values( xref_id=xref_id + offset, external_db_id=external_db_id, @@ -674,19 +684,19 @@ def add_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> int: info_text=info_text, ) ) - return xref_id else: return int(new_xref_id) - offset - def add_object_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> int: + def add_object_xref(self, offset: int, args: Dict[str, Any], session: Session) -> int: object_xref_id = args["object_xref_id"] ensembl_id = args["ensembl_id"] ensembl_type = args["ensembl_type"] xref_id = args["xref_id"] analysis_id = args["analysis_id"] - new_object_xref_id = dbi.execute( + # Check if the object_xref already exists + new_object_xref_id = session.execute( select(ObjectXrefCORM.object_xref_id).where( ObjectXrefCORM.xref_id == xref_id, ObjectXrefCORM.ensembl_object_type == ensembl_type, @@ -695,8 +705,9 @@ def add_object_xref(self, offset: int, args: Dict[str, Any], 
dbi: Connection) -> ) ).scalar() + # If it doesn't exist, insert it if not new_object_xref_id: - dbi.execute( + session.execute( insert(ObjectXrefCORM).values( object_xref_id=object_xref_id + offset, ensembl_id=ensembl_id, @@ -705,7 +716,6 @@ def add_object_xref(self, offset: int, args: Dict[str, Any], dbi: Connection) -> analysis_id=analysis_id, ) ) - return object_xref_id else: return int(new_object_xref_id) - offset @@ -724,8 +734,7 @@ def update_unmapped_xrefs(self, dbi: Connection) -> None: XrefUORM.info_type == "DIRECT", ) ) - result = dbi.execute(query).fetchall() - xref_ids = [row[0] for row in result] + xref_ids = [row.xref_id for row in dbi.execute(query).mappings().all()] dbi.execute( update(XrefUORM) .values(dumped="UNMAPPED_NO_STABLE_ID") @@ -761,8 +770,7 @@ def update_unmapped_xrefs(self, dbi: Connection) -> None: DependentXref.info_type == "DEPENDENT", ) ) - result = dbi.execute(query).fetchall() - xref_ids = [row[0] for row in result] + xref_ids = [row.xref_id for row in dbi.execute(query).mappings().all()] dbi.execute( update(XrefUORM) .values(dumped="UNMAPPED_MASTER_FAILED") @@ -784,15 +792,14 @@ def update_unmapped_xrefs(self, dbi: Connection) -> None: XrefUORM.info_type == "SEQUENCE_MATCH", ) ) - result = dbi.execute(query).fetchall() - xref_ids = [row[0] for row in result] + xref_ids = [row.xref_id for row in dbi.execute(query).mappings().all()] dbi.execute( update(XrefUORM) .values(dumped="UNMAPPED_NO_MAPPING") .where(XrefUORM.xref_id.in_(xref_ids)) ) - # Dependents with non existent masters (none on time of loading) + # Dependents with non-existent masters (none at the time of loading) dbi.execute( update(XrefUORM) .values(dumped="UNMAPPED_NO_MASTER") diff --git a/src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py b/src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py index b97b858c7..dbf930854 100644 --- a/src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py +++ b/src/python/ensembl/production/xrefs/mappers/methods/ChecksumBasic.py @@ -15,7 +15,6 @@ """Base method module for handling checksums.""" from Bio import SeqIO -from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq import hashlib @@ -26,7 +25,6 @@ DEFAULT_BATCH_SIZE = 1000 DEFAULT_LOG_SIZE = 10000 - class ChecksumBasic: def __init__(self, args: Dict[str, Any] = None) -> None: if args is None: @@ -51,36 +49,29 @@ def batch_size(self, batch_size: int = None) -> int: return self._batch_size def run(self, target: str, source_id: int, object_type: str, dbi: Connection) -> List[Dict[str, Any]]: - results, tmp_list = [], [] - count, total_count = 0, 0 + results = [] + tmp_list = [] + count = 0 + total_count = 0 batch_size = self.batch_size() for record in SeqIO.parse(target, "fasta"): tmp_list.append(record) count += 1 - if (count % batch_size) == 0: - res = self.perform_mapping(tmp_list, source_id, object_type, dbi) - for row in res: - results.append(row) - + if count % batch_size == 0: + results.extend(self.perform_mapping(tmp_list, source_id, object_type, dbi)) total_count += count - if total_count % DEFAULT_LOG_SIZE: - self.mapper().log_progress( - f"Finished batch mapping of {total_count} sequences" - ) + if total_count % DEFAULT_LOG_SIZE == 0: + self.mapper().log_progress(f"Finished batch mapping of {total_count} sequences") count = 0 tmp_list.clear() # Final mapping if there were some left over - if len(tmp_list) > 0: - self.mapper().log_progress( - f"Finished batch mapping of {total_count} sequences" - ) - res = self.perform_mapping(tmp_list, 
source_id, object_type, dbi) - for row in res: - results.append(row) - tmp_list.clear() + if tmp_list: + results.extend(self.perform_mapping(tmp_list, source_id, object_type, dbi)) + total_count += count + self.mapper().log_progress(f"Finished batch mapping of {total_count} sequences") return results diff --git a/src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py b/src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py index 993753cd6..ed02c65ba 100644 --- a/src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py +++ b/src/python/ensembl/production/xrefs/mappers/methods/MySQLChecksum.py @@ -14,11 +14,14 @@ """Base method module for handling mysql checksums.""" -from ensembl.production.xrefs.mappers.methods.ChecksumBasic import * - from sqlalchemy import select +from typing import Any, Dict, List +from Bio.SeqRecord import SeqRecord +from sqlalchemy.engine import Connection + from ensembl.xrefs.xref_source_db_model import ChecksumXref as ChecksumXrefSORM +from ensembl.production.xrefs.mappers.methods.ChecksumBasic import ChecksumBasic class MySQLChecksum(ChecksumBasic): def perform_mapping(self, sequences: List[SeqRecord], source_id: int, object_type: str, dbi: Connection) -> List[Dict[str, Any]]: @@ -32,13 +35,15 @@ def perform_mapping(self, sequences: List[SeqRecord], source_id: int, object_typ ChecksumXrefSORM.checksum == checksum, ChecksumXrefSORM.source_id == source_id, ) - for row in dbi.execute(query).mappings().all(): - local_upi = row.accession - if upi: - raise LookupError( - f"The sequence {sequence.id} had a checksum of {checksum} but this resulted in more than one UPI: [{upi}, {local_upi}]" - ) - upi = local_upi + results = dbi.execute(query).mappings().all() + + if len(results) > 1: + upis = [row.accession for row in results] + raise LookupError( + f"The sequence {sequence.id} had a checksum of {checksum} but this resulted in more than one UPI: {upis}" + ) + elif results: + upi = results[0].accession if upi: final_results.append( diff --git a/src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py b/src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py index 3a2b20dbd..13ca0972b 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py +++ b/src/python/ensembl/production/xrefs/mappers/species/aedes_aegypti.py @@ -14,8 +14,10 @@ """Mapper extension module for species aedes_aegypti.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class aedes_aegypti(BasicMapper): def gene_description_sources(self) -> List[str]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py b/src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py index 46e30cf99..0191d0b61 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py +++ b/src/python/ensembl/production/xrefs/mappers/species/anopheles_gambiae.py @@ -14,8 +14,10 @@ """Mapper extension module for species anopheles_gambiae.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class anopheles_gambiae(BasicMapper): def gene_description_sources(self) -> List[str]: diff --git 
a/src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py b/src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py index 36a5f6696..77725da25 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py +++ b/src/python/ensembl/production/xrefs/mappers/species/culex_quinquefasciatus.py @@ -14,8 +14,10 @@ """Mapper extension module for species culex_quinquefasciatus.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class culex_quinquefasciatus(BasicMapper): def gene_description_sources(self) -> List[str]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py b/src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py index 3a2b155ec..af81a04e9 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py +++ b/src/python/ensembl/production/xrefs/mappers/species/danio_rerio.py @@ -14,10 +14,9 @@ """Mapper extension module for species danio_rerio.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs - class danio_rerio(BasicMapper): def set_display_xrefs(self) -> None: display = DisplayXrefs(self) diff --git a/src/python/ensembl/production/xrefs/mappers/species/drosophila.py b/src/python/ensembl/production/xrefs/mappers/species/drosophila.py index 2e327a735..3d1b5fb83 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/drosophila.py +++ b/src/python/ensembl/production/xrefs/mappers/species/drosophila.py @@ -14,8 +14,10 @@ """Mapper extension module for species drosophila.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class drosophila(BasicMapper): def gene_description_filter_regexps(self) -> List[str]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/eukaryota.py b/src/python/ensembl/production/xrefs/mappers/species/eukaryota.py index 1791da9c5..00be97b20 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/eukaryota.py +++ b/src/python/ensembl/production/xrefs/mappers/species/eukaryota.py @@ -14,8 +14,29 @@ """Mapper extension module for species eukaryota.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +from typing import Dict, List, Tuple +from sqlalchemy.orm import aliased +from sqlalchemy import select, update, func, delete +from sqlalchemy.sql.expression import Select +from sqlalchemy.dialects.mysql import insert + +from ensembl.xrefs.xref_update_db_model import ( + Source as SourceUORM, + Xref as XrefUORM, + DependentXref as DependentXrefUORM, + ObjectXref as ObjectXrefUORM +) + +from ensembl.core.models import ( + Gene as GeneORM, + Transcript as TranscriptORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + ObjectXref as ObjectXrefCORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class eukaryota(BasicMapper): def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py b/src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py index 616bd7326..b19bbf9be 100644 --- 
a/src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py +++ b/src/python/ensembl/production/xrefs/mappers/species/homo_sapiens.py @@ -14,8 +14,8 @@ """Mapper extension module for species homo_sapiens.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper +from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs class homo_sapiens(BasicMapper): def official_name(self) -> str: diff --git a/src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py b/src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py index 5861e03a7..10cc0d739 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py +++ b/src/python/ensembl/production/xrefs/mappers/species/ixodes_scapularis.py @@ -14,8 +14,10 @@ """Mapper extension module for species ixodes_scapularis.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class ixodes_scapularis(BasicMapper): def gene_description_sources(self) -> List[str]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py b/src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py index cde22b34f..307b0c7b0 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py +++ b/src/python/ensembl/production/xrefs/mappers/species/mus_musculus.py @@ -14,8 +14,8 @@ """Mapper extension module for species mus_musculus.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper +from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs class mus_musculus(BasicMapper): def official_name(self) -> str: diff --git a/src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py b/src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py index df2bb072c..93725c0b6 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py +++ b/src/python/ensembl/production/xrefs/mappers/species/neurospora_crassa.py @@ -14,8 +14,10 @@ """Mapper extension module for species neurospora_crassa.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class neurospora_crassa(BasicMapper): def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/parasite.py b/src/python/ensembl/production/xrefs/mappers/species/parasite.py index 408d84d08..81fd43753 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/parasite.py +++ b/src/python/ensembl/production/xrefs/mappers/species/parasite.py @@ -14,8 +14,9 @@ """Mapper extension module for species parasite.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import List +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class parasite(BasicMapper): def set_transcript_names(self) -> None: diff --git a/src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py b/src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py index 53925875d..4ab5a86f3 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py +++ 
b/src/python/ensembl/production/xrefs/mappers/species/rattus_norvegicus.py @@ -14,8 +14,8 @@ """Mapper extension module for species rattus_norvegicus.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper +from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs class rattus_norvegicus(BasicMapper): def official_name(self) -> str: diff --git a/src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py b/src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py index 707dcc7db..088cf7f42 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py +++ b/src/python/ensembl/production/xrefs/mappers/species/saccharomyces_cerevisiae.py @@ -14,8 +14,10 @@ """Mapper extension module for species saccharomyces_cerevisiae.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class saccharomyces_cerevisiae(BasicMapper): def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py b/src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py index 742f1207c..c8547e1af 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py +++ b/src/python/ensembl/production/xrefs/mappers/species/sars_cov_2.py @@ -14,8 +14,18 @@ """Mapper extension module for species sars_cov_2.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +import logging +from sqlalchemy import delete, insert, select, update, func + +from ensembl.core.models import ( + Gene as GeneORM, + Transcript as TranscriptORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + ObjectXref as ObjectXrefCORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class sars_cov_2(BasicMapper): def set_transcript_names(self) -> None: diff --git a/src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py b/src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py index 8c7d66d8e..c0f9adfbd 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py +++ b/src/python/ensembl/production/xrefs/mappers/species/schizosaccharomyces_pombe.py @@ -14,8 +14,10 @@ """Mapper extension module for species schizosaccharomyces_pombe.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +from typing import Dict, List, Tuple +from sqlalchemy.sql.expression import Select +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class schizosaccharomyces_pombe(BasicMapper): def gene_display_xref_sources(self) -> Tuple[List[str], Dict[str, Select]]: diff --git a/src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py b/src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py index a3182e7f7..fb733b7a6 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py +++ b/src/python/ensembl/production/xrefs/mappers/species/sus_scrofa.py @@ -14,8 +14,8 @@ """Mapper extension module for species sus_scrofa.""" -from ensembl.production.xrefs.mappers.BasicMapper import * - +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper +from ensembl.production.xrefs.mappers.DisplayXrefs import DisplayXrefs class sus_scrofa(BasicMapper): def official_name(self) -> 
str: diff --git a/src/python/ensembl/production/xrefs/mappers/species/wormbase.py b/src/python/ensembl/production/xrefs/mappers/species/wormbase.py index 796d6260e..c666bc3bc 100644 --- a/src/python/ensembl/production/xrefs/mappers/species/wormbase.py +++ b/src/python/ensembl/production/xrefs/mappers/species/wormbase.py @@ -14,8 +14,19 @@ """Mapper extension module for species wormbase.""" -from ensembl.production.xrefs.mappers.BasicMapper import * +import logging +from typing import List +from sqlalchemy.sql.expression import select, update +from ensembl.core.models import ( + Gene as GeneORM, + Transcript as TranscriptORM, + Xref as XrefCORM, + ExternalDb as ExternalDbORM, + ObjectXref as ObjectXrefCORM +) + +from ensembl.production.xrefs.mappers.BasicMapper import BasicMapper class wormbase(BasicMapper): def set_display_xrefs(self) -> None: @@ -41,7 +52,6 @@ def set_display_xrefs(self) -> None: "Could not find wormbase_transcript and wormbase_locus in external_db table, so doing nothing" ) - xref_dbi.close() core_dbi.close() return diff --git a/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py b/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py index 53e78e887..f7e166a5d 100644 --- a/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py +++ b/src/python/ensembl/production/xrefs/parsers/ArrayExpressParser.py @@ -30,7 +30,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: species_id = args.get("species_id") species_name = args.get("species_name") xref_file = args.get("file", "") - dba = args.get("dba") + db_url = args.get("extra_db_url") ensembl_release = args.get("ensembl_release") xref_dbi = args.get("xref_dbi") verbose = args.get("verbose", False) @@ -62,7 +62,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: # Connect to the appropriate arrayexpress db arrayexpress_db_url = self.get_arrayexpress_db_url( - project, db_user, db_pass, db_host, db_port, db_name, species_name, ensembl_release, dba, verbose + project, db_user, db_pass, db_host, db_port, db_name, species_name, ensembl_release, db_url, verbose ) if not arrayexpress_db_url: @@ -118,7 +118,7 @@ def is_arryaexpress_active(self, species_lookup: Dict[str, bool], names: List[st return True return False - def get_arrayexpress_db_url(self, project: str, db_user: str, db_pass: str, db_host: str, db_port: str, db_name: str, species_name: str, ensembl_release: str, dba: str, verbose: bool) -> Optional[URL]: + def get_arrayexpress_db_url(self, project: str, db_user: str, db_pass: str, db_host: str, db_port: str, db_name: str, species_name: str, ensembl_release: str, db_url: str, verbose: bool) -> Optional[URL]: if db_host: return URL.create("mysql", db_user, db_pass, db_host, db_port, db_name) elif project == "ensembl": @@ -130,13 +130,13 @@ def get_arrayexpress_db_url(self, project: str, db_user: str, db_pass: str, db_h if verbose: logging.info("Looking for db in mysql-eg-staging-1 and mysql-eg-staging-2") registry = "ensro@mysql-eg-staging-1.ebi.ac.uk:4160" - db_url = self.get_db_from_registry(species_name, "core", ensembl_release, registry) - if not db_url: + sta_db_url = self.get_db_from_registry(species_name, "core", ensembl_release, registry) + if not sta_db_url: registry = "ensro@mysql-eg-staging-2.ebi.ac.uk:4275" return self.get_db_from_registry(species_name, "core", ensembl_release, registry) + return sta_db_url + elif db_url: return db_url - elif dba: - return dba return None diff --git a/src/python/ensembl/production/xrefs/parsers/BaseParser.py 
b/src/python/ensembl/production/xrefs/parsers/BaseParser.py index ad6440e37..baae025ed 100644 --- a/src/python/ensembl/production/xrefs/parsers/BaseParser.py +++ b/src/python/ensembl/production/xrefs/parsers/BaseParser.py @@ -142,7 +142,7 @@ def set_release(self, source_id: int, s_release: str, dbi: Connection) -> None: .values(source_release=s_release) ) - def upload_xref_object_graphs(self, xrefs: List[Dict[str, Any]], dbi: Connection) -> None: + def add_xref_objects(self, xrefs: List[Dict[str, Any]], dbi: Connection) -> None: """Adds xref data into a database. Uploads main xref data, related direct xrefs, dependent xrefs, and synonyms. @@ -356,45 +356,6 @@ def add_direct_xref(self, general_xref_id: int, ensembl_stable_id: str, ensembl_ ) ) - def add_to_direct_xrefs(self, args: Dict[str, Any], dbi: Connection) -> None: - """Adds direct xref data into both the xref table and direct xref tables in a database. - This calls the functions add_xref and add_direct_xref. - - Parameters - ---------- - args: dict - The direct xref arguments. These include: - - stable_id: The ensEMBL feature stable ID - - ensembl_type: The feature type (gene, transcript, or translation) - - accession: The xref accession - - source_id: The xref source ID - - species_id: The species ID - - version (optional): The xref version (default is 0) - - label (optional): The xref label (default is the xref accession) - - description (optional): The xref description - - linkage (optional): The type of link between the xref and ensEMBL - - info_text (optional): Additional info related to the xref (default is empty string) - - info_type (optional): The type of xref being added (default is DIRECT) - dbi: sqlalchemy.engine.Connection - The database connection to update in - """ - stable_id = args["stable_id"] - ensembl_type = args["ensembl_type"] - accession = args["accession"] - source_id = args["source_id"] - species_id = args["species_id"] - version = args.get("version", 0) - label = args.get("label", accession) - description = args.get("description") - linkage = args.get("linkage") - info_text = args.get("info_text", "") - - args["info_type"] = args.get("info_type", "DIRECT") - - # If the accession already has an xref find it else cretae a new one - direct_xref_id = self.add_xref(args, dbi) - self.add_direct_xref(direct_xref_id, stable_id, ensembl_type, linkage, dbi) - def get_direct_xref_id(self, stable_id: str, ensembl_type: str, link: str, dbi: Connection) -> int: """Retrieves the direct xref row ID from stable ID, ensEMBL type and linkage type. @@ -710,7 +671,7 @@ def build_dependent_mappings(self, source_id: int, dbi: Connection) -> None: f"{row.master_xref_id}|{row.dependent_xref_id}" ] = row.linkage_annotation - def get_valid_codes(self, source_name: str, species_id: int, dbi: Connection) -> Dict[str, List[int]]: + def get_acc_to_xref_ids(self, source_name: str, species_id: int, dbi: Connection) -> Dict[str, List[int]]: """Retrieves the xref accessions and IDs related to a specific xref source and species from a database. Parameters @@ -726,7 +687,7 @@ def get_valid_codes(self, source_name: str, species_id: int, dbi: Connection) -> ------- A dict variable containing {'accession' : [list of xref IDs]} items. 
""" - valid_codes = {} + acc_to_xref_ids = {} sources = [] big_name = "%" + source_name.upper() + "%" @@ -741,9 +702,9 @@ def get_valid_codes(self, source_name: str, species_id: int, dbi: Connection) -> XrefUORM.species_id == species_id, XrefUORM.source_id == source_id ) for row in dbi.execute(query).fetchall(): - valid_codes.setdefault(row[0], []).append(row[1]) + acc_to_xref_ids.setdefault(row[0], []).append(row[1]) - return valid_codes + return acc_to_xref_ids def is_file_header_valid(self, columns_count: int, field_patterns: List[str], header: List[str], case_sensitive: bool = False) -> bool: """Checks whether the provided file header is valid by checking length and column patterns. @@ -780,7 +741,7 @@ def is_file_header_valid(self, columns_count: int, field_patterns: List[str], he return True def add_to_syn(self, accession: str, source_id: int, synonym: str, species_id: int, dbi: Connection) -> None: - """Add synomyn data for an xref given its accession and source ID. + """Adds synomyn data for an xref given its accession and source ID. Parameters ---------- diff --git a/src/python/ensembl/production/xrefs/parsers/CCDSParser.py b/src/python/ensembl/production/xrefs/parsers/CCDSParser.py index 24d1e088c..159638916 100644 --- a/src/python/ensembl/production/xrefs/parsers/CCDSParser.py +++ b/src/python/ensembl/production/xrefs/parsers/CCDSParser.py @@ -33,7 +33,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: source_id = args.get("source_id") species_id = args.get("species_id") xref_file = args.get("file", "") - dba = args.get("dba") + db_url = args.get("extra_db_url") xref_dbi = args.get("xref_dbi") verbose = args.get("verbose", False) @@ -53,8 +53,8 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: ccds_db_url = URL.create( "mysql", db_user, db_pass, db_host, db_port, db_name ) - elif dba: - ccds_db_url = dba + elif db_url: + ccds_db_url = db_url if not ccds_db_url: return 1, "Could not find CCDS DB." 
diff --git a/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py b/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py index 699c633ba..dc0d5720f 100644 --- a/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py +++ b/src/python/ensembl/production/xrefs/parsers/EntrezGeneParser.py @@ -35,6 +35,10 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: if not source_id or not species_id or not xref_file: raise AttributeError("Missing required arguments: source_id, species_id, and file") + + wiki_source_id = self.get_source_id_for_source_name("WikiGene", xref_dbi) + if verbose: + logging.info(f"Wiki source id = {wiki_source_id}") with self.get_filehandle(xref_file) as file_io: if file_io.read(1) == '': @@ -64,10 +68,6 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: if not self.is_file_header_valid(self.EXPECTED_NUMBER_OF_COLUMNS, patterns, header): raise ValueError(f"Malformed or unexpected header in EntrezGene file {xref_file}") - wiki_source_id = self.get_source_id_for_source_name("WikiGene", xref_dbi) - if verbose: - logging.info(f"Wiki source id = {wiki_source_id}") - processed_count, syn_count = self.process_lines(csv_reader, source_id, species_id, wiki_source_id, xref_dbi) result_message = f"{processed_count} EntrezGene Xrefs and {processed_count} WikiGene Xrefs added with {syn_count} synonyms" diff --git a/src/python/ensembl/production/xrefs/parsers/HGNCParser.py b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py index b8bca4e45..d64eead15 100644 --- a/src/python/ensembl/production/xrefs/parsers/HGNCParser.py +++ b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py @@ -24,7 +24,7 @@ from sqlalchemy import select from sqlalchemy.engine import Connection from sqlalchemy.engine.url import URL -from unidecode import unidecode +from unidecode import unidecode # type: ignore from ensembl.core.models import ( Transcript as TranscriptORM, @@ -39,7 +39,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: source_id = args.get("source_id") species_id = args.get("species_id") xref_file = args.get("file") - dba = args.get("dba") + db_url = args.get("extra_db_url") xref_dbi = args.get("xref_dbi") verbose = args.get("verbose", False) @@ -66,7 +66,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: name_count = {key: 0 for key in source_ids} # Connect to the ccds db - ccds_db_url = dba or self.construct_db_url(file_params) + ccds_db_url = db_url or self.construct_db_url(file_params) if not ccds_db_url: raise AttributeError("No ensembl ccds database provided") if verbose: @@ -95,11 +95,13 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: result_message += f"{syn_count} synonyms added\n" result_message += f"{name_count['desc_only']} HGNC ids could not be associated in xrefs" + result_message = re.sub(r"\n", "--", result_message) + return 0, result_message def process_lines(self, csv_reader: csv.DictReader, source_ids: Dict[str, int], name_count: Dict[str, int], species_id: int, ccds_db_url: str, xref_dbi: Connection) -> int: # Prepare lookup lists - refseq = self.get_valid_codes("refseq", species_id, xref_dbi) + refseq = self.get_acc_to_xref_ids("refseq", species_id, xref_dbi) source_list = ["refseq_peptide", "refseq_mRNA"] entrezgene = self.get_valid_xrefs_for_dependencies("EntrezGene", source_list, xref_dbi) diff --git a/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py b/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py index 94dc7466a..c156c6758 100644 --- 
a/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py +++ b/src/python/ensembl/production/xrefs/parsers/JGI_ProteinParser.py @@ -60,7 +60,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: } xrefs.append(xref) - self.upload_xref_object_graphs(xrefs, xref_dbi) + self.add_xref_objects(xrefs, xref_dbi) result_message = f"{len(xrefs)} JGI_ xrefs successfully parsed" diff --git a/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py b/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py index 4a1654fc5..cd63875b4 100644 --- a/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py +++ b/src/python/ensembl/production/xrefs/parsers/Mim2GeneParser.py @@ -71,9 +71,9 @@ def process_lines(self, csv_reader: csv.reader, xref_file:str, species_id: int, self.build_dependent_mappings(mim_gene_source_id, xref_dbi) self.build_dependent_mappings(mim_morbid_source_id, xref_dbi) - mim_gene = self.get_valid_codes("MIM_GENE", species_id, xref_dbi) - mim_morbid = self.get_valid_codes("MIM_MORBID", species_id, xref_dbi) - entrez = self.get_valid_codes("EntrezGene", species_id, xref_dbi) + mim_gene = self.get_acc_to_xref_ids("MIM_GENE", species_id, xref_dbi) + mim_morbid = self.get_acc_to_xref_ids("MIM_MORBID", species_id, xref_dbi) + entrez = self.get_acc_to_xref_ids("EntrezGene", species_id, xref_dbi) # Read lines for line in csv_reader: diff --git a/src/python/ensembl/production/xrefs/parsers/RFAMParser.py b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py index e760cbf9e..4f13abd3e 100644 --- a/src/python/ensembl/production/xrefs/parsers/RFAMParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py @@ -17,7 +17,7 @@ import logging import os import re -import wget +import wget # type: ignore from typing import Any, Dict, List, Optional, Tuple from urllib.parse import urlparse from sqlalchemy import and_, select @@ -44,7 +44,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: species_id = args.get("species_id") species_name = args.get("species_name") xref_file = args.get("file") - dba = args.get("dba") + db_url = args.get("extra_db_url") ensembl_release = args.get("ensembl_release") xref_dbi = args.get("xref_dbi") verbose = args.get("verbose", False) @@ -70,7 +70,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: species_name = species_id_to_names[species_id][0] # Connect to the appropriate rfam db - rfam_db_url = self.get_rfam_db_url(db_host, db_user, db_pass, db_port, db_name, dba, species_name, ensembl_release, verbose) + rfam_db_url = self.get_rfam_db_url(db_host, db_user, db_pass, db_port, db_name, db_url, species_name, ensembl_release, verbose) if not rfam_db_url: raise AttributeError("Could not find RFAM DB.") if verbose: @@ -86,11 +86,11 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: result_message = f"Added {xref_count} RFAM xrefs and {direct_count} direct xrefs" return 0, result_message - def get_rfam_db_url(self, db_host: str, db_user: str, db_pass: str, db_port: str, db_name: str, dba: str, species_name: str, ensembl_release: str, verbose: bool) -> Any: + def get_rfam_db_url(self, db_host: str, db_user: str, db_pass: str, db_port: str, db_name: str, db_url: str, species_name: str, ensembl_release: str, verbose: bool) -> Any: if db_host: return URL.create("mysql", db_user, db_pass, db_host, db_port, db_name) - elif dba: - return dba + elif db_url: + return db_url else: if verbose: logging.info("Looking for db in mysql-ens-sta-1") diff --git a/src/python/ensembl/production/xrefs/parsers/RGDParser.py 
b/src/python/ensembl/production/xrefs/parsers/RGDParser.py index 54b574e82..22284fd4a 100644 --- a/src/python/ensembl/production/xrefs/parsers/RGDParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RGDParser.py @@ -55,7 +55,7 @@ def process_lines(self, csv_reader: csv.DictReader, source_id: int, direct_sourc dependent_count, ensembl_count, mismatch_count, syn_count = 0, 0, 0, 0 # Used to assign dbIDs for when RGD Xrefs are dependent on RefSeq xrefs - preloaded_refseq = self.get_valid_codes("refseq", species_id, xref_dbi) + preloaded_refseq = self.get_acc_to_xref_ids("refseq", species_id, xref_dbi) for line in csv_reader: # Don't bother doing anything if we don't have an RGD ID or if the symbol is an Ensembl ID diff --git a/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py b/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py index c00df2cc4..ba6ec9f25 100644 --- a/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py +++ b/src/python/ensembl/production/xrefs/parsers/ReactomeParser.py @@ -100,7 +100,7 @@ def process_file(self, xref_file: str, alias_to_species_id: Dict[str, int], sour # Get existing uniprot accessions is_uniprot = bool(re.search("UniProt", xref_file)) - uniprot_accessions = self.get_valid_codes("uniprot/", species_id, xref_dbi) if is_uniprot else {} + uniprot_accessions = self.get_acc_to_xref_ids("uniprot/", species_id, xref_dbi) if is_uniprot else {} with self.get_filehandle(xref_file) as file_io: if file_io.read(1) == '': diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py index 61662dcaf..699a0af83 100644 --- a/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py @@ -27,7 +27,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: source_id = args.get("source_id") species_id = args.get("species_id") species_name = args.get("species_name") - dba = args.get("dba") + db_url = args.get("extra_db_url") xref_dbi = args.get("xref_dbi") verbose = args.get("verbose", False) @@ -46,7 +46,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: species_name = species_id_to_names[species_id][0] # Connect to the appropriate dbs - if dba: + if db_url: return self.run_perl_script(args, source_ids, species_name) else: # Not all species have an otherfeatures database, skip if not found @@ -85,14 +85,15 @@ def run_perl_script(self, args: Dict[str, Any], source_ids: Dict[str, int], spec logging.info(f"Running perl script {scripts_dir}/refseq_coordinate_parser.pl") perl_cmd = ( - f"perl {scripts_dir}/refseq_coordinate_parser.pl " - f"--xref_db_url '{xref_db_url}' " - f"--core_db_url '{args.get('core_db_url')}' " - f"--otherf_db_url '{args.get('dba')}' " - f"--source_ids '{source_ids_json}' " - f"--species_id {args.get('species_id')} " - f"--species_name {species_name} " - f"--release {args.get('ensembl_release')}" + f"perl " + f"{scripts_dir}/refseq_coordinate_parser.pl " + f"--xref_db_url '{xref_db_url}' " + f"--core_db_url '{args.get('core_db_url')}' " + f"--otherf_db_url '{args.get('extra_db_url')}' " + f"--source_ids '{source_ids_json}' " + f"--species_id {args.get('species_id')} " + f"--species_name {species_name} " + f"--release {args.get('ensembl_release')}" ) cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqParser.py
b/src/python/ensembl/production/xrefs/parsers/RefSeqParser.py index f9e62c218..9958c0490 100644 --- a/src/python/ensembl/production/xrefs/parsers/RefSeqParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RefSeqParser.py @@ -138,10 +138,10 @@ def create_xrefs(self, source_ids: Dict[str, int], species_id: int, species_name # Retrieve existing RefSeq mRNA, EntrezGene, and WikiGene xrefs entrez_acc_to_label = self.get_acc_to_label("EntrezGene", species_id, dbi) - refseq_ids = self.get_valid_codes("RefSeq_mRNA", species_id, dbi) - refseq_ids.update(self.get_valid_codes("RefSeq_mRNA_predicted", species_id, dbi)) - entrez_ids = self.get_valid_codes("EntrezGene", species_id, dbi) - wiki_ids = self.get_valid_codes("WikiGene", species_id, dbi) + refseq_ids = self.get_acc_to_xref_ids("RefSeq_mRNA", species_id, dbi) + refseq_ids.update(self.get_acc_to_xref_ids("RefSeq_mRNA_predicted", species_id, dbi)) + entrez_ids = self.get_acc_to_xref_ids("EntrezGene", species_id, dbi) + wiki_ids = self.get_acc_to_xref_ids("WikiGene", species_id, dbi) xrefs = [] @@ -217,7 +217,7 @@ def create_xrefs(self, source_ids: Dict[str, int], species_id: int, species_name xrefs.append(xref) if xrefs: - self.upload_xref_object_graphs(xrefs, dbi) + self.add_xref_objects(xrefs, dbi) result_message = ( f'Added {counts["num_mrna"]} mRNA xrefs, {counts["num_pred_mrna"]} predicted mRNA xrefs, ' diff --git a/src/python/ensembl/production/xrefs/parsers/UniProtParser.py b/src/python/ensembl/production/xrefs/parsers/UniProtParser.py index 1886c6fc6..42c2ceb01 100644 --- a/src/python/ensembl/production/xrefs/parsers/UniProtParser.py +++ b/src/python/ensembl/production/xrefs/parsers/UniProtParser.py @@ -344,12 +344,12 @@ def create_xrefs(self, source_ids: Dict[str, int], species_id: int, xref_file: s xrefs.append(xref) if count > 1000: - self.upload_xref_object_graphs(xrefs, dbi) + self.add_xref_objects(xrefs, dbi) count = 0 xrefs.clear() if xrefs: - self.upload_xref_object_graphs(xrefs, dbi) + self.add_xref_objects(xrefs, dbi) result_message = ( f'Read {counts["num_sp"]} SwissProt xrefs, {counts["num_sptr"]} SPTrEMBL xrefs with protein evidence codes 1-2, ' diff --git a/src/python/ensembl/production/xrefs/parsers/ZFINParser.py b/src/python/ensembl/production/xrefs/parsers/ZFINParser.py index 2792af8ff..66b51dadf 100644 --- a/src/python/ensembl/production/xrefs/parsers/ZFINParser.py +++ b/src/python/ensembl/production/xrefs/parsers/ZFINParser.py @@ -53,8 +53,8 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: descriptions[row.accession] = row.description # Get the Uniprot and RefSeq accessions - swiss = self.get_valid_codes("uniprot/swissprot", species_id, xref_dbi) - refseq = self.get_valid_codes("refseq", species_id, xref_dbi) + swiss = self.get_acc_to_xref_ids("uniprot/swissprot", species_id, xref_dbi) + refseq = self.get_acc_to_xref_ids("refseq", species_id, xref_dbi) file_dir = os.path.dirname(xref_file) counts = {"direct": 0, "uniprot": 0, "refseq": 0, "synonyms": 0, "mismatch": 0} @@ -142,7 +142,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: counts["mismatch"] += 1 # Get the added ZFINs - zfin = self.get_valid_codes("zfin", species_id, xref_dbi) + zfin = self.get_acc_to_xref_ids("zfin", species_id, xref_dbi) sources = [] query = select(SourceUORM.source_id).where(SourceUORM.name.like("ZFIN_ID")) diff --git a/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py b/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py index cc90ea85c..7dfc965d2 100644 --- 
a/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py +++ b/src/python/ensembl/production/xrefs/parsers/miRBaseParser.py @@ -49,7 +49,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: if not xrefs: return 0, "No xrefs added" - self.upload_xref_object_graphs(xrefs, xref_dbi) + self.add_xref_objects(xrefs, xref_dbi) result_message = f"Read {len(xrefs)} xrefs from {file}" return 0, result_message From 0cdcc5862df4f644ca9ae63192554099b68b6602 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 6 Jan 2025 10:19:44 +0000 Subject: [PATCH 07/12] Test modifications --- src/python/test/xrefs/flatfiles/sources.json | 16 ---------------- .../test/xrefs/parsers/test_ccds_parser.py | 2 +- src/python/test/xrefs/test_checksum.py | 17 +++++++++++++++++ src/python/test/xrefs/test_download_source.py | 10 +++------- .../test/xrefs/test_schedule_alignment.py | 16 +++------------- src/python/test/xrefs/test_schedule_download.py | 2 +- src/python/test/xrefs/test_schedule_parse.py | 11 +++-------- 7 files changed, 28 insertions(+), 46 deletions(-) delete mode 100644 src/python/test/xrefs/flatfiles/sources.json diff --git a/src/python/test/xrefs/flatfiles/sources.json b/src/python/test/xrefs/flatfiles/sources.json deleted file mode 100644 index 1b45a2acb..000000000 --- a/src/python/test/xrefs/flatfiles/sources.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "name" : "ArrayExpress", - "parser" : "ArrayExpressParser", - "file" : "Database", - "db" : "core", - "priority" : 1 - }, - { - "name" : "RNACentral", - "parser" : "ChecksumParser", - "file" : "https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/md5/md5.tsv.gz", - "db" : "checksum", - "priority" : 1 - } -] \ No newline at end of file diff --git a/src/python/test/xrefs/parsers/test_ccds_parser.py b/src/python/test/xrefs/parsers/test_ccds_parser.py index 4b22225ef..7e10c9a70 100644 --- a/src/python/test/xrefs/parsers/test_ccds_parser.py +++ b/src/python/test/xrefs/parsers/test_ccds_parser.py @@ -25,7 +25,7 @@ def run_and_validate_parsing(ccds_parser: CCDSParser, mock_xref_dbi: DBConnectio { "source_id": SOURCE_ID_CCDS, "species_id": SPECIES_ID_HUMAN, - "dba": "mock_ccds_db_url", + "extra_db_url": "mock_ccds_db_url", "xref_dbi": mock_xref_dbi, } ) diff --git a/src/python/test/xrefs/test_checksum.py b/src/python/test/xrefs/test_checksum.py index 4d86ad0c7..db43bd998 100644 --- a/src/python/test/xrefs/test_checksum.py +++ b/src/python/test/xrefs/test_checksum.py @@ -2,6 +2,7 @@ import os import shutil import datetime +from sqlalchemy import text from typing import Any, Dict, Callable, Optional from ensembl.utils.database import DBConnection from test_helpers import check_row_count @@ -24,6 +25,20 @@ def _create_checksum(args: Optional[Dict[str, Any]] = None) -> Checksum: return Checksum(args, True, True) return _create_checksum +# Function to populate the database with sources +def populate_source_db(mock_source_dbi: DBConnection): + source_data = [ + [1, 'RNACentral', 'ChecksumParser'], + [2, 'UniParc', 'ChecksumParser'], + ] + for row in source_data: + mock_source_dbi.execute( + text("INSERT INTO source (source_id, name, parser) VALUES (:source_id, :name, :parser)"), + {"source_id": row[0], "name": row[1], "parser": row[2],} + ) + + mock_source_dbi.commit() + # Test case to check if an error is raised when a mandatory parameter is missing def test_checksum_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): test_missing_required_param("Checksum", DEFAULT_ARGS, "base_path") @@ -32,6 
+47,8 @@ def test_checksum_missing_required_param(test_missing_required_param: Callable[[ # Test case to check successful run def test_successful_run(mock_source_dbi: DBConnection, checksum: Checksum, pytestconfig: pytest.Config): + populate_source_db(mock_source_dbi) + # Setup for test parameters and create a Checksum instance test_scratch_path = pytestconfig.getoption("test_scratch_path") args = { diff --git a/src/python/test/xrefs/test_download_source.py b/src/python/test/xrefs/test_download_source.py index 4e537ab5b..3b4988677 100644 --- a/src/python/test/xrefs/test_download_source.py +++ b/src/python/test/xrefs/test_download_source.py @@ -30,13 +30,9 @@ def _create_download_source(args: Optional[Dict[str, Any]] = None) -> DownloadSo # Test case to check if an error is raised when a mandatory parameter is missing def test_download_source_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): - test_missing_required_param("DownloadSource", DEFAULT_ARGS, "base_path") - test_missing_required_param("DownloadSource", DEFAULT_ARGS, "parser") - test_missing_required_param("DownloadSource", DEFAULT_ARGS, "name") - test_missing_required_param("DownloadSource", DEFAULT_ARGS, "priority") - test_missing_required_param("DownloadSource", DEFAULT_ARGS, "source_db_url") - test_missing_required_param("DownloadSource", DEFAULT_ARGS, "file") - test_missing_required_param("DownloadSource", DEFAULT_ARGS, "skip_download") + required_params = ["base_path", "parser", "name", "priority", "source_db_url", "file", "skip_download"] + for param in required_params: + test_missing_required_param("DownloadSource", DEFAULT_ARGS, param) # Test case to check if an error is raised when an invalid URL scheme is provided def test_invalid_url_scheme(download_source: DownloadSource, pytestconfig): diff --git a/src/python/test/xrefs/test_schedule_alignment.py b/src/python/test/xrefs/test_schedule_alignment.py index 2254a58e2..cbdee9952 100644 --- a/src/python/test/xrefs/test_schedule_alignment.py +++ b/src/python/test/xrefs/test_schedule_alignment.py @@ -35,19 +35,9 @@ def _create_schedule_alignment(args: Optional[Dict[str, Any]] = None) -> Schedul # Test case to check if an error is raised when a mandatory parameter is missing def test_schedule_alignment_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "species_name") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "release") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "ensembl_fasta") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "xref_fasta") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "seq_type") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "xref_db_url") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "base_path") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "method") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "query_cutoff") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "target_cutoff") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "source_id") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "source_name") - test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, "job_index") + required_params = ["species_name", "release", "ensembl_fasta", "xref_fasta", "seq_type", "xref_db_url", "base_path", "method", 
"query_cutoff", "target_cutoff", "source_id", "source_name", "job_index"] + for param in required_params: + test_missing_required_param("ScheduleAlignment", DEFAULT_ARGS, param) # Test case to check successful run def test_successful_run(schedule_alignment: ScheduleAlignment, pytestconfig: pytest.Config): diff --git a/src/python/test/xrefs/test_schedule_download.py b/src/python/test/xrefs/test_schedule_download.py index 8c17eb123..817a74607 100644 --- a/src/python/test/xrefs/test_schedule_download.py +++ b/src/python/test/xrefs/test_schedule_download.py @@ -76,7 +76,7 @@ def test_successful_run(schedule_download: ScheduleDownload, pytestconfig): user_name = os.getenv("USER", "default_user") test_db_name = f"{user_name}_test_xref_source_db_{timestamp}" args = { - "config_file": "flatfiles/sources.json", + "config_file": "flatfiles/sources_download.json", "source_db_url": f"{test_mysql_url}/{test_db_name}", "reuse_db": False, "dataflow_output_path": test_scratch_path diff --git a/src/python/test/xrefs/test_schedule_parse.py b/src/python/test/xrefs/test_schedule_parse.py index 04b3cd4ca..b34e19a5a 100644 --- a/src/python/test/xrefs/test_schedule_parse.py +++ b/src/python/test/xrefs/test_schedule_parse.py @@ -68,14 +68,9 @@ def populate_source_db(mock_source_dbi: DBConnection): # Test case to check if an error is raised when a mandatory parameter is missing def test_schedule_parse_missing_required_param(test_missing_required_param: Callable[[str, Dict[str, Any], str], None]): - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "species_name") - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "release") - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "registry_url") - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "priority") - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "source_db_url") - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "xref_db_url") - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "get_species_file") - test_missing_required_param("ScheduleParse", DEFAULT_ARGS, "sources_config_file") + required_params = ["species_name", "release", "registry_url", "priority", "source_db_url", "xref_db_url", "get_species_file"] + for param in required_params: + test_missing_required_param("ScheduleParse", DEFAULT_ARGS, param) # Test case to check if an error is raised when priority is invalid def test_invalid_priority(schedule_parse: ScheduleParse): From d24e06bcb105aaef12529243b3fbb81e6df94017 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 6 Jan 2025 11:37:19 +0000 Subject: [PATCH 08/12] Bug fixes --- .../production/xrefs/AdvisoryXrefReport.py | 20 +-- .../ensembl/production/xrefs/Alignment.py | 19 +-- src/python/ensembl/production/xrefs/Base.py | 2 +- .../ensembl/production/xrefs/DumpEnsembl.py | 7 +- .../production/xrefs/ScheduleCleanup.py | 2 - .../ensembl/production/xrefs/ScheduleParse.py | 8 +- .../xrefs/mappers/CoordinateMapper.py | 6 +- .../production/xrefs/mappers/DisplayXrefs.py | 9 +- .../xrefs/mappers/OfficialNaming.py | 7 +- .../production/xrefs/mappers/XrefLoader.py | 149 +++++++++--------- .../production/xrefs/parsers/HGNCParser.py | 2 +- .../production/xrefs/parsers/RFAMParser.py | 2 +- .../xrefs/parsers/RefSeqCoordinateParser.py | 33 ++-- .../xrefs/flatfiles/sources_download.json | 16 ++ 14 files changed, 144 insertions(+), 138 deletions(-) create mode 100644 src/python/test/xrefs/flatfiles/sources_download.json diff --git a/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py 
b/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py index c12ce0e6b..152fe6976 100644 --- a/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py +++ b/src/python/ensembl/production/xrefs/AdvisoryXrefReport.py @@ -26,15 +26,17 @@ def run(self): datacheck_name: str = self.get_param("datacheck_name", {"type": str}) datacheck_output: str = self.get_param("datacheck_output", {"type": str}) - # Create or locate report file - report_file = self.get_path( - base_path, species_name, release, "dc_report", f"{datacheck_name}.log" - ) - # Return the quotation marks into the output datacheck_output = re.sub("__", "'", datacheck_output) - # Write datacheck result into file - with open(report_file, "a") as fh: - fh.write(datacheck_output) - fh.write("\n") + # Only interested in failed tests + if re.search("Failed test", datacheck_output): + # Create or locate report file + report_file = self.get_path( + base_path, species_name, release, "dc_report", f"{datacheck_name}.log" + ) + + # Write datacheck result into file + with open(report_file, "a") as fh: + fh.write(datacheck_output) + fh.write("\n") diff --git a/src/python/ensembl/production/xrefs/Alignment.py b/src/python/ensembl/production/xrefs/Alignment.py index 5edac0b00..abea37b53 100644 --- a/src/python/ensembl/production/xrefs/Alignment.py +++ b/src/python/ensembl/production/xrefs/Alignment.py @@ -15,6 +15,7 @@ """Alignment module to map xref sequences into ensEMBL ones.""" import re +import shlex import subprocess from sqlalchemy.dialects.mysql import insert @@ -45,23 +46,11 @@ def run(self): # Construct Exonerate command ryo = "xref:%qi:%ti:%ei:%ql:%tl:%qab:%qae:%tab:%tae:%C:%s\n" exe = subprocess.check_output(["which", "exonerate"]).decode("utf-8").strip() - command_string = [ - exe, - "--showalignment", "FALSE", - "--showvulgar", "FALSE", - "--ryo", f"'{ryo}'", - "--gappedextension", "FALSE", - "--model", "'affine:local'", - method, - "--subopt", "no", - "--query", source, - "--target", target, - "--querychunktotal", str(max_chunks), - "--querychunkid", str(chunk) - ] + command_string = f"{exe} --showalignment FALSE --showvulgar FALSE --ryo '{ryo}' --gappedextension FALSE --model 'affine:local' {method} --subopt no --query {source} --target {target} --querychunktotal {max_chunks} --querychunkid {chunk}" + command_list = shlex.split(command_string) # Get exonerate hits - output = subprocess.run(command_string, stdout=subprocess.PIPE, text=True) + output = subprocess.run(command_list, capture_output=True, text=True) exit_code = abs(output.returncode) if exit_code == 0: diff --git a/src/python/ensembl/production/xrefs/Base.py b/src/python/ensembl/production/xrefs/Base.py index 04aad4971..53a04f2d2 100644 --- a/src/python/ensembl/production/xrefs/Base.py +++ b/src/python/ensembl/production/xrefs/Base.py @@ -21,7 +21,7 @@ import fnmatch import gzip import importlib -import wget # type: ignore +import wget import threading import logging import random diff --git a/src/python/ensembl/production/xrefs/DumpEnsembl.py b/src/python/ensembl/production/xrefs/DumpEnsembl.py index c34635f6d..00b219f87 100644 --- a/src/python/ensembl/production/xrefs/DumpEnsembl.py +++ b/src/python/ensembl/production/xrefs/DumpEnsembl.py @@ -40,18 +40,17 @@ def run(self): logging.info(f"Dna and peptide data already dumped for species '{species_name}', skipping.") else: scripts_dir: str = self.get_param("perl_scripts_dir", {"required": True, "type": str}) + dump_script = os.path.join(scripts_dir, 'dump_ensembl.pl') - logging.info(f"Running perl script 
{scripts_dir}/dump_ensembl.pl") + logging.info(f"Running perl script {dump_script}") perl_cmd = [ - "perl", - f"{scripts_dir}/dump_ensembl.pl", + "perl", dump_script, "--cdna_path", cdna_path, "--pep_path", pep_path, "--species", species_name, "--core_db_url", core_db_url, "--release", str(release) ] - # subprocess.run(perl_cmd, check=True, stdout=subprocess.PIPE) subprocess.run(perl_cmd, capture_output=True, text=True, check=True) # Create jobs for peptide dumping and alignment diff --git a/src/python/ensembl/production/xrefs/ScheduleCleanup.py b/src/python/ensembl/production/xrefs/ScheduleCleanup.py index 19388b9fb..9ec5a8b7e 100644 --- a/src/python/ensembl/production/xrefs/ScheduleCleanup.py +++ b/src/python/ensembl/production/xrefs/ScheduleCleanup.py @@ -33,14 +33,12 @@ def run(self): source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) clean_files: Optional[bool] = self.get_param("clean_files", {"type": bool}) clean_dir: Optional[str] = self.get_param("clean_dir", {"type": str}) - split_files_by_species: Optional[bool] = self.get_param("split_files_by_species", {"type": bool}) logging.info("ScheduleCleanup starting with parameters:") logging.info(f"Param: base_path = {base_path}") logging.info(f"Param: source_db_url = {source_db_url}") logging.info(f"Param: clean_files = {clean_files}") logging.info(f"Param: clean_dir = {clean_dir}") - logging.info(f"Param: split_files_by_species = {split_files_by_species}") # Connect to source db db_engine = self.get_db_engine(source_db_url) diff --git a/src/python/ensembl/production/xrefs/ScheduleParse.py b/src/python/ensembl/production/xrefs/ScheduleParse.py index 149eb1c71..2317025f3 100644 --- a/src/python/ensembl/production/xrefs/ScheduleParse.py +++ b/src/python/ensembl/production/xrefs/ScheduleParse.py @@ -122,6 +122,7 @@ def run(self): hgnc_path = None total_sources = 0 + zfin_scheduled = False for source in sources: if source.name == "HGNC": @@ -129,6 +130,9 @@ def run(self): if source.db == "checksum" or source.priority != order_priority: continue + + if source.name == "ZFIN_ID" and zfin_scheduled: + continue dataflow_params = { "species_name": species_name, @@ -205,6 +209,7 @@ def run(self): if source.name == "ZFIN_ID": list_files = [list_files[0]] + zfin_scheduled = True for file in list_files: if source.revision and file == source.revision: @@ -213,8 +218,7 @@ def run(self): dataflow_params["file_name"] = file if re.search(r"^Uniprot", source.name) and hgnc_path: - - hgnc_files = glob.glob(hgnc_path + "/*") + hgnc_files = glob.glob(os.path.join(hgnc_path, "*")) dataflow_params["hgnc_file"] = hgnc_files[0] self.write_output(dataflow_suffix, dataflow_params) diff --git a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py index 6bf44f8bc..e8aac3f77 100644 --- a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py @@ -114,10 +114,10 @@ def run_coordinatemapping(self, species_name: str, species_id: int, scripts_dir: if analysis_id: logging.info(f"Analysis ID is {analysis_id}") - logging.info(f"Running perl script {scripts_dir}/coordinate_mapper.pl") + mapper_script = os.path.join(scripts_dir, 'coordinate_mapper.pl') + logging.info(f"Running perl script {mapper_script}") perl_cmd = [ - "perl", - f"{scripts_dir}/coordinate_mapper.pl", + "perl", mapper_script, "--xref_db_url", str(self.xref()), "--core_db_url", str(self.core()), "--species_id", 
str(species_id), diff --git a/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py index 22b6f61b7..964eb26cd 100644 --- a/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py +++ b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py @@ -18,7 +18,8 @@ import re from datetime import datetime from typing import Dict, List, Tuple -from sqlalchemy import select, insert, update, delete, case, desc, func, aliased +from sqlalchemy import select, insert, update, delete, case, desc, func +from sqlalchemy.orm import aliased from sqlalchemy.engine import Connection from sqlalchemy.sql import Select @@ -744,9 +745,9 @@ def set_display_xrefs_from_stable_table(self) -> None: TranscriptStableIdORM.internal_id, TranscriptStableIdORM.display_xref_id ) for row in xref_dbi.execute(query).mappings().all(): - xref_id = int(row.display_xref_id) + if row.display_xref_id: + xref_id = int(row.display_xref_id) - if xref_id: # Set display xref ID core_dbi.execute( update(TranscriptORM) @@ -757,9 +758,9 @@ def set_display_xrefs_from_stable_table(self) -> None: # Clean up synonyms linked to xrefs which are not display xrefs query = ( select(ExternalSynonymORM) + .join(XrefCORM, XrefCORM.xref_id == ExternalSynonymORM.xref_id) .outerjoin(GeneORM, GeneORM.display_xref_id == XrefCORM.xref_id) .where( - ExternalSynonymORM.xref_id == XrefCORM.xref_id, GeneORM.display_xref_id == None, ) ) diff --git a/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py b/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py index 82768724e..74976506c 100644 --- a/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py +++ b/src/python/ensembl/production/xrefs/mappers/OfficialNaming.py @@ -17,7 +17,8 @@ import logging import re from typing import Any, Dict, Tuple, List -from sqlalchemy import select, func, update, case, desc, insert, aliased, delete +from sqlalchemy import select, func, update, case, desc, insert, delete +from sqlalchemy.orm import aliased from sqlalchemy.engine import Connection from ensembl.xrefs.xref_update_db_model import ( @@ -436,13 +437,13 @@ def get_official_domain_name(self, args: Dict[str, Any], dbi: Connection) -> Tup def set_the_best_display_name(self, display_names: Dict[int, bool], xref_list: List[int], object_xref_list: List[int], xref_id_to_display: Dict[int, str], verbose: bool, dbi: Connection) -> Tuple[str, int]: gene_symbol, gene_symbol_xref_id = None, None - for xref_id in xref_list: + for index,xref_id in enumerate(xref_list): # Remove object xrefs that are not in the best display names list if not display_names.get(xref_id): if verbose: logging.info(f"Removing {xref_id_to_display[xref_id]} from gene") self.update_object_xref_status( - object_xref_list[xref_id], "MULTI_DELETE", dbi + object_xref_list[index], "MULTI_DELETE", dbi ) else: if verbose: diff --git a/src/python/ensembl/production/xrefs/mappers/XrefLoader.py b/src/python/ensembl/production/xrefs/mappers/XrefLoader.py index c95ee7716..8e9ba37d8 100644 --- a/src/python/ensembl/production/xrefs/mappers/XrefLoader.py +++ b/src/python/ensembl/production/xrefs/mappers/XrefLoader.py @@ -93,7 +93,7 @@ def update(self, species_name: str) -> None: ) # Delete existing xrefs in core DB (only from relevant sources) - self.deleted_existing_xrefs(name_to_external_db_id, xref_dbi) + self.deleted_existing_xrefs(name_to_external_db_id, xref_dbi, core_dbi) # Get the offsets for xref and object_xref tables xref_offset = 
core_dbi.execute(select(func.max(XrefCORM.xref_id))).scalar() or 0 @@ -498,7 +498,7 @@ def delete_projection_data(self, dbi: Connection) -> None: f"Deleted all PROJECTIONs rows: {counts['external_synonym']} external_synonyms, {counts['dependent_xref']} dependent_xrefs, {counts['object_xref']} object_xrefs, {counts['xref']} xrefs" ) - def deleted_existing_xrefs(self, name_to_external_db_id: Dict[str, int], xref_dbi: Connection) -> None: + def deleted_existing_xrefs(self, name_to_external_db_id: Dict[str, int], xref_dbi: Connection, core_dbi: Connection) -> None: # For each external_db to be updated, delete the existing xrefs query = ( select(SourceUORM.name, func.count(XrefUORM.xref_id).label("count")) @@ -528,92 +528,87 @@ def deleted_existing_xrefs(self, name_to_external_db_id: Dict[str, int], xref_db logging.info(f"For source '{name}'") - Session = sessionmaker(bind=self.core().execution_options(isolation_level="READ COMMITTED")) - with Session.begin() as session: - try: - counts["gene"] = session.execute( - update(GeneORM) - .values(display_xref_id=None, description=None) - .where( - GeneORM.display_xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - logging.info( - f"\tSet display_xref_id=NULL and description=NULL for {counts['gene']} gene row(s)" + try: + counts["gene"] = core_dbi.execute( + update(GeneORM) + .values(display_xref_id=None, description=None) + .where( + GeneORM.display_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, ) + ).rowcount + logging.info( + f"\tSet display_xref_id=NULL and description=NULL for {counts['gene']} gene row(s)" + ) - counts["external_synonym"] = session.execute( - delete(ExternalSynonymORM).where( - ExternalSynonymORM.xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - counts["identity_xref"] = session.execute( - delete(IdentityXrefCORM).where( - IdentityXrefCORM.object_xref_id == ObjectXrefCORM.object_xref_id, - ObjectXrefCORM.xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - counts["object_xref"] = session.execute( - delete(ObjectXrefCORM).where( - ObjectXrefCORM.xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, - ) - ).rowcount - - MasterXref = aliased(XrefCORM) - DependentXref = aliased(XrefCORM) - - query = select( - ObjectXrefCORM.object_xref_id, - DependentXrefCORM.master_xref_id, - DependentXrefCORM.dependent_xref_id, - ).where( - ObjectXrefCORM.object_xref_id == DependentXrefCORM.object_xref_id, - MasterXref.xref_id == DependentXrefCORM.master_xref_id, - DependentXref.xref_id == DependentXrefCORM.dependent_xref_id, - MasterXref.external_db_id == external_db_id, + counts["external_synonym"] = core_dbi.execute( + delete(ExternalSynonymORM).where( + ExternalSynonymORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, ) - for sub_row in session.execute(query).mappings().all(): - counts["master_dependent_xref"] += session.execute( - delete(DependentXrefCORM).where( - DependentXrefCORM.master_xref_id == sub_row.master_xref_id, - DependentXrefCORM.dependent_xref_id == sub_row.dependent_xref_id, - ) - ).rowcount - counts["master_object_xref"] += session.execute( - delete(ObjectXrefCORM).where( - ObjectXrefCORM.object_xref_id == sub_row.object_xref_id - ) - ).rowcount - - counts["dependent_xref"] = session.execute( + ).rowcount + counts["identity_xref"] = core_dbi.execute( + delete(IdentityXrefCORM).where( + IdentityXrefCORM.object_xref_id == 
ObjectXrefCORM.object_xref_id, + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["object_xref"] = core_dbi.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + + MasterXref = aliased(XrefCORM) + DependentXref = aliased(XrefCORM) + + query = select( + ObjectXrefCORM.object_xref_id, + DependentXrefCORM.master_xref_id, + DependentXrefCORM.dependent_xref_id, + ).where( + ObjectXrefCORM.object_xref_id == DependentXrefCORM.object_xref_id, + MasterXref.xref_id == DependentXrefCORM.master_xref_id, + DependentXref.xref_id == DependentXrefCORM.dependent_xref_id, + MasterXref.external_db_id == external_db_id, + ) + for sub_row in core_dbi.execute(query).mappings().all(): + counts["master_dependent_xref"] += core_dbi.execute( delete(DependentXrefCORM).where( - DependentXrefCORM.dependent_xref_id == XrefCORM.xref_id, - XrefCORM.external_db_id == external_db_id, + DependentXrefCORM.master_xref_id == sub_row.master_xref_id, + DependentXrefCORM.dependent_xref_id == sub_row.dependent_xref_id, ) ).rowcount - counts["xref"] = session.execute( - delete(XrefCORM).where(XrefCORM.external_db_id == external_db_id) - ).rowcount - counts["unmapped_object"] = session.execute( - delete(UnmappedObjectORM).where( - UnmappedObjectORM.unmapped_object_type == "xref", - UnmappedObjectORM.external_db_id == external_db_id, + counts["master_object_xref"] += core_dbi.execute( + delete(ObjectXrefCORM).where( + ObjectXrefCORM.object_xref_id == sub_row.object_xref_id ) ).rowcount - logging.info( - f"\tDeleted rows: {counts['external_synonym']} external_synonyms, {counts['identity_xref']} identity_xrefs, {counts['object_xref']} object_xrefs, {counts['master_dependent_xref']} master dependent_xrefs, {counts['master_object_xref']} master object_xrefs, {counts['dependent_xref']} dependent_xrefs, {counts['xref']} xrefs, {counts['unmapped_object']} unmapped_objects" + counts["dependent_xref"] = core_dbi.execute( + delete(DependentXrefCORM).where( + DependentXrefCORM.dependent_xref_id == XrefCORM.xref_id, + XrefCORM.external_db_id == external_db_id, + ) + ).rowcount + counts["xref"] = core_dbi.execute( + delete(XrefCORM).where(XrefCORM.external_db_id == external_db_id) + ).rowcount + counts["unmapped_object"] = core_dbi.execute( + delete(UnmappedObjectORM).where( + UnmappedObjectORM.unmapped_object_type == "xref", + UnmappedObjectORM.external_db_id == external_db_id, ) + ).rowcount - session.commit() - except SQLAlchemyError as e: - session.rollback() - logging.error(f"Failed to delete rows for source '{name}': {e}") - raise RuntimeError(f"Transaction failed for source '{name}'") + logging.info( + f"\tDeleted rows: {counts['external_synonym']} external_synonyms, {counts['identity_xref']} identity_xrefs, {counts['object_xref']} object_xrefs, {counts['master_dependent_xref']} master dependent_xrefs, {counts['master_object_xref']} master object_xrefs, {counts['dependent_xref']} dependent_xrefs, {counts['xref']} xrefs, {counts['unmapped_object']} unmapped_objects" + ) + except SQLAlchemyError as e: + logging.error(f"Failed to delete existing rows for source '{name}': {e}") + raise RuntimeError(f"Failed to delete existing rows for source '{name}': {e}") def get_analysis(self, dbi: Connection) -> Dict[str, int]: analysis_ids = {} diff --git a/src/python/ensembl/production/xrefs/parsers/HGNCParser.py b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py index 
d64eead15..21df867fe 100644 --- a/src/python/ensembl/production/xrefs/parsers/HGNCParser.py +++ b/src/python/ensembl/production/xrefs/parsers/HGNCParser.py @@ -24,7 +24,7 @@ from sqlalchemy import select from sqlalchemy.engine import Connection from sqlalchemy.engine.url import URL -from unidecode import unidecode # type: ignore +from unidecode import unidecode from ensembl.core.models import ( Transcript as TranscriptORM, diff --git a/src/python/ensembl/production/xrefs/parsers/RFAMParser.py b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py index 4f13abd3e..7988534fd 100644 --- a/src/python/ensembl/production/xrefs/parsers/RFAMParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RFAMParser.py @@ -17,7 +17,7 @@ import logging import os import re -import wget # type: ignore +import wget from typing import Any, Dict, List, Optional, Tuple from urllib.parse import urlparse from sqlalchemy import and_, select diff --git a/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py index 699a0af83..a7ce38729 100644 --- a/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py +++ b/src/python/ensembl/production/xrefs/parsers/RefSeqCoordinateParser.py @@ -14,6 +14,7 @@ """Parser module for RefSeq coordinate xrefs.""" +import os import json import logging import subprocess @@ -47,7 +48,7 @@ def run(self, args: Dict[str, Any]) -> Tuple[int, str]: # Connect to the appropriate dbs if db_url: - return self.run_perl_script(args, source_ids, species_name) + return self.run_perl_script(args, source_ids, species_id, species_name) else: # Not all species have an otherfeatures database, skip if not found return 0, f"Skipped. No otherfeatures database for '{species_name}'." 
@@ -74,7 +75,7 @@ def get_source_ids(self, verbose: bool, xref_dbi: Connection) -> Dict[str, int]: return source_ids - def run_perl_script(self, args: Dict[str, Any], source_ids: Dict[str, int], species_name: str) -> Tuple[int, str]: + def run_perl_script(self, args: Dict[str, Any], source_ids: Dict[str, int], species_id: int, species_name: str) -> Tuple[int, str]: # For now, we run a perl script to add the xrefs, which has some mandatory arguments scripts_dir = args.get("perl_scripts_dir") xref_db_url = args.get("xref_db_url") @@ -83,22 +84,22 @@ def run_perl_script(self, args: Dict[str, Any], source_ids: Dict[str, int], spec source_ids_json = json.dumps(source_ids) - logging.info(f"Running perl script {scripts_dir}/refseq_coordinate_parser.pl") - perl_cmd = ( - "perl", - f"{scripts_dir}/refseq_coordinate_parser.pl" - f"--xref_db_url", xref_db_url - f"--core_db_url", args.get('core_db_url'), - f"--otherf_db_url", args.get('extra_db_url'), - f"--source_ids", source_ids_json, - f"--species_id", str(species_id), - f"--species_name", species_name - f"--release", str(args.get('ensembl_release')) - ) - cmd_output = subprocess.run(perl_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + parser_script = os.path.join(scripts_dir, 'refseq_coordinate_parser.pl') + logging.info(f"Running perl script {parser_script}") + perl_cmd = [ + "perl", parser_script, + "--xref_db_url", xref_db_url, + "--core_db_url", args.get('core_db_url'), + "--otherf_db_url", args.get('extra_db_url'), + "--source_ids", source_ids_json, + "--species_id", str(species_id), + "--species_name", species_name, + "--release", str(args.get('ensembl_release')) + ] + cmd_output = subprocess.run(perl_cmd, capture_output=True, text=True) if cmd_output.returncode != 0: - logging.error(f"Perl script ({scripts_dir}/refseq_coordinate_parser.pl) failed with error: {cmd_output.stderr.decode('utf-8')}") + logging.error(f"Perl script ({scripts_dir}/refseq_coordinate_parser.pl) failed with error: {cmd_output.stderr}") return 1, "Failed to add refseq_import xrefs." return 0, "Added refseq_import xrefs." 
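The hunk above swaps a malformed command tuple (missing commas between elements, passed with shell=True) for a proper argument list handed to subprocess.run with capture_output=True and text=True, followed by a returncode check that logs stderr on failure. A minimal sketch of that invocation pattern, assuming a hypothetical helper name and option dict that are not part of this patch:

    import logging
    import os
    import subprocess

    def run_helper_script(scripts_dir: str, script_name: str, options: dict) -> int:
        # Build the command as a list: each flag and value is its own argv element,
        # so URLs and JSON strings need no shell quoting.
        cmd = ["perl", os.path.join(scripts_dir, script_name)]
        for flag, value in options.items():
            cmd.extend([f"--{flag}", str(value)])

        # text=True returns stdout/stderr as str, so no manual decode('utf-8') is needed.
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            logging.error(f"{script_name} failed: {result.stderr}")
        return result.returncode

Building the argument list and dropping shell=True matches how the dump_ensembl.pl and coordinate_mapper.pl calls are constructed elsewhere in this series.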
diff --git a/src/python/test/xrefs/flatfiles/sources_download.json b/src/python/test/xrefs/flatfiles/sources_download.json new file mode 100644 index 000000000..1b45a2acb --- /dev/null +++ b/src/python/test/xrefs/flatfiles/sources_download.json @@ -0,0 +1,16 @@ +[ + { + "name" : "ArrayExpress", + "parser" : "ArrayExpressParser", + "file" : "Database", + "db" : "core", + "priority" : 1 + }, + { + "name" : "RNACentral", + "parser" : "ChecksumParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/md5/md5.tsv.gz", + "db" : "checksum", + "priority" : 1 + } +] \ No newline at end of file From 833158748d708a6f354275a7f576b03587496980 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 13 Jan 2025 10:12:42 +0000 Subject: [PATCH 09/12] More fixes --- nextflow/config/xref.config | 6 +++++- src/python/ensembl/production/xrefs/Base.py | 15 +++++++++++++-- .../production/xrefs/mappers/CoordinateMapper.py | 1 + .../production/xrefs/mappers/DisplayXrefs.py | 5 +++-- 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/nextflow/config/xref.config b/nextflow/config/xref.config index 2518e806e..2d3047bf3 100644 --- a/nextflow/config/xref.config +++ b/nextflow/config/xref.config @@ -14,8 +14,8 @@ params.sources_config_file = "${params.work_dir}/ensembl-production/src/python/e params.source_db_url = '' params.skip_download = 0 params.reuse_db = 0 -params.split_files_by_species = 1 params.tax_ids_file = '' +params.tax_ids_list = '' params.update_mode = 0 params.base_path = '' @@ -69,6 +69,10 @@ profiles { memory = 1.GB } + withLabel: mem10GB { + memory = 10.GB + } + withLabel:cleanup_mem { memory = 4.GB errorStrategy = 'retry' diff --git a/src/python/ensembl/production/xrefs/Base.py b/src/python/ensembl/production/xrefs/Base.py index 53a04f2d2..00972493a 100644 --- a/src/python/ensembl/production/xrefs/Base.py +++ b/src/python/ensembl/production/xrefs/Base.py @@ -892,8 +892,19 @@ def get_xref_mapper(self, xref_url: str, species: str, base_path: str, release: if not core_url: core_url = self.get_db_from_registry(species, "core", release, registry) - core_db = self.get_db_engine(core_url) - xref_db = self.get_db_engine(xref_url) + core_db = create_engine( + make_url(core_url), + isolation_level="AUTOCOMMIT", + pool_recycle=18000, + pool_pre_ping=True + ) + + xref_db = create_engine( + make_url(xref_url), + isolation_level="AUTOCOMMIT", + pool_recycle=18000, + pool_pre_ping=True + ) # Extract host and dbname from xref URL xref_url_obj = make_url(xref_url) diff --git a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py index e8aac3f77..35ee7ac80 100644 --- a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py @@ -14,6 +14,7 @@ """Mapper module for processing coordinate xref data.""" +import os import subprocess import logging from datetime import datetime diff --git a/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py index 964eb26cd..8bcca8a83 100644 --- a/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py +++ b/src/python/ensembl/production/xrefs/mappers/DisplayXrefs.py @@ -18,10 +18,11 @@ import re from datetime import datetime from typing import Dict, List, Tuple -from sqlalchemy import select, insert, update, delete, case, desc, func +from sqlalchemy import select, update, delete, case, desc, func from sqlalchemy.orm import 
aliased from sqlalchemy.engine import Connection from sqlalchemy.sql import Select +from sqlalchemy.dialects.mysql import insert from ensembl.core.models import ( Gene as GeneORM, @@ -127,7 +128,7 @@ def set_display_xrefs(self, set_transcript_display_xrefs: bool) -> None: ensembl_object_type=object_type, source_id=row.source_id, priority=priority, - ) + ).on_duplicate_key_update(priority=priority) ) logging.info(f"{priority} - {row.name}") From f7cf53aeaf6fff2b9269f91382ee19e3338d4a73 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 13 Jan 2025 10:13:37 +0000 Subject: [PATCH 10/12] Uniprot & Refseq are always split by species --- nextflow/workflows/xrefDownload.nf | 57 ++++-------------- nextflow/workflows/xrefProcess.nf | 58 +++++-------------- scripts/xrefs/cleanup_and_split_source.pl | 16 +++-- .../ensembl/production/xrefs/ScheduleParse.py | 27 ++++----- 4 files changed, 52 insertions(+), 106 deletions(-) diff --git a/nextflow/workflows/xrefDownload.nf b/nextflow/workflows/xrefDownload.nf index 8034627ed..66dd4ee98 100644 --- a/nextflow/workflows/xrefDownload.nf +++ b/nextflow/workflows/xrefDownload.nf @@ -4,12 +4,6 @@ params.pipeline_name = 'Xref Download Pipeline' params.help = false -// Ensure all paths are absolute -params.scripts_dir = file(params.scripts_dir).toAbsolutePath().toString() -params.perl_scripts_dir = file(params.perl_scripts_dir).toAbsolutePath().toString() -params.base_path = file(params.base_path).toAbsolutePath().toString() -params.clean_dir = file(params.clean_dir).toAbsolutePath().toString() - println """\ XREF DOWNLOAD PIPELINE ====================== @@ -18,10 +12,10 @@ println """\ reuse_db : ${params.reuse_db} skip_download : ${params.skip_download} clean_files : ${params.clean_files} - split_files_by_species : ${params.split_files_by_species} config_file : ${params.config_file} clean_dir : ${params.clean_dir} tax_ids_file : ${params.tax_ids_file} + tax_ids_list : ${params.tax_ids_list} update_mode : ${params.update_mode} """ .stripIndent() @@ -45,9 +39,6 @@ def helpMessage() { --clean_files (optional) If set to 1, the Cleanup analysis will be run for RefSeq and UniProt files. Default: 1 - --split_files_by_species (optional) If set to 1, UniProt and RefSeq file will be split according to taxonomy ID. - Default: 1 - --config_file (optional) Path to the json file containing information about xref sources to download. Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_all_sources.json @@ -55,10 +46,13 @@ def helpMessage() { Default: [--base_path]/clean_files --tax_ids_file (optional) Path to the file containing the taxonomy IDs of the species to extract data for. - Used to update the data for the provided species. + Each taxonomy ID on a line. + + --tax_ids_list (optional) List of taxonomy IDs of the species to extract data for, separated by commas. + Takes precedence over --tax_ids_file. --update_mode (optional) If set to 1, pipeline is in update mode, refreshing/updating its data for new taxonomy IDs. - Only used if --tax_ids_file is set. Default: 0 + Only used if --tax_ids_file or --tax_ids_list are set. 
Default: 0 """.stripIndent() } @@ -89,13 +83,8 @@ workflow { ScheduleCleanup(CleanupTmpFiles.out, timestamp) Checksum(ScheduleCleanup.out[0], timestamp) - if (params.split_files_by_species) { - CleanupSplitSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) - NotifyByEmail(Checksum.out.concat(CleanupSplitSource.out.collect()).collect(), timestamp) - } else { - CleanupSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) - NotifyByEmail(Checksum.out.concat(CleanupSource.out.collect()).collect(), timestamp) - } + CleanupSplitSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) + NotifyByEmail(Checksum.out.concat(CleanupSplitSource.out.collect()).collect(), timestamp) } process ScheduleDownload { @@ -158,7 +147,7 @@ process ScheduleCleanup { path 'dataflow_cleanup_sources.json' """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleCleanup --base_path ${params.base_path} --source_db_url ${params.source_db_url} --clean_files ${params.clean_files} --clean_dir ${params.clean_dir} --split_files_by_species ${params.split_files_by_species} --log_timestamp $timestamp + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleCleanup --base_path ${params.base_path} --source_db_url ${params.source_db_url} --clean_files ${params.clean_files} --clean_dir ${params.clean_dir} --log_timestamp $timestamp """ } @@ -195,7 +184,9 @@ process CleanupSplitSource { version_file = (x =~ /"version_file":\s*"(.*?)"/)[0][1] cmd_params = "${cmd_params} --version_file '${version_file}'" } - if (params.tax_ids_file) { + if (params.tax_ids_list) { + cmd_params = "${cmd_params} --tax_ids_list ${params.tax_ids_list}" + } else if (params.tax_ids_file) { cmd_params = "${cmd_params} --tax_ids_file ${params.tax_ids_file}" } @@ -204,30 +195,6 @@ process CleanupSplitSource { """ } -process CleanupSource { - label 'cleanup_mem' - tag "$src_name" - - input: - val x - val timestamp - - output: - val 'CleanupDone' - - shell: - cmd_params = "" - src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] - if (x =~ /"version_file":/) { - version_file = (x =~ /"version_file":\s*"(.*?)"/)[0][1] - cmd_params = "${cmd_params} --version_file '${version_file}'" - } - - """ - perl ${params.perl_scripts_dir}/cleanup_source.pl --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --skip_download ${params.skip_download} --clean_files ${params.clean_files} $cmd_params - """ -} - process NotifyByEmail { label 'small_process' diff --git a/nextflow/workflows/xrefProcess.nf b/nextflow/workflows/xrefProcess.nf index 8ae1d8c19..58788f082 100644 --- a/nextflow/workflows/xrefProcess.nf +++ b/nextflow/workflows/xrefProcess.nf @@ -4,11 +4,6 @@ params.pipeline_name = 'Xref Process Pipeline' params.help = false -// Ensure all paths are absolute -params.scripts_dir = file(params.scripts_dir).toAbsolutePath().toString() -params.perl_scripts_dir = file(params.perl_scripts_dir).toAbsolutePath().toString() -params.base_path = file(params.base_path).toAbsolutePath().toString() - println """\ XREF PROCESS PIPELINE ====================== @@ -21,7 +16,6 @@ println """\ species : ${params.species} antispecies : ${params.antispecies} division : ${params.division} - split_files_by_species : ${params.split_files_by_species} sources_config_file : ${params.sources_config_file} registry_file : ${params.registry_file} dc_config_file : ${params.dc_config_file} @@ -57,9 +51,6 @@ def 
helpMessage() { --division (optional) Comma-separated list of divisions to run pipeline on. Will be disregarded if --run_all is set to 1. - --split_files_by_species (optional) If set to 1, UniProt and RefSeq file will be split according to taxonomy ID. - Default: 1 - --sources_config_file (optional) Path to the ini file containing information about all xref sources and species/divisions. Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_config.ini @@ -175,7 +166,10 @@ workflow species_flow { // Run datachecks RunXrefCriticalDatacheck(Mapping.out) RunXrefAdvisoryDatacheck(RunXrefCriticalDatacheck.out) - advisory_report_ch = process_output(RunXrefAdvisoryDatacheck.out) + + dataflow_combined = RunXrefAdvisoryDatacheck.out.dataflow_success + .mix(RunXrefAdvisoryDatacheck.out.dataflow_fail) + advisory_report_ch = process_output(dataflow_combined) // Collect advisory datacheck outputs AdvisoryXrefReport(advisory_report_ch, timestamp) @@ -248,14 +242,8 @@ process ScheduleParse { tuple val(species_name), path('dataflow_primary_sources.json') tuple val(species_name), path('dataflow_schedule_secondary.json') - shell: - cmd_params = "" - if (params.split_files_by_species) { - cmd_params = "${cmd_params} --get_species_file 1" - } - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 1 --sources_config_file ${params.sources_config_file} --source_db_url ${params.source_db_url} --xref_db_url ${params.xref_db_url} --base_path ${params.base_path} --log_timestamp $timestamp $cmd_params + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 1 --sources_config_file ${params.sources_config_file} --source_db_url ${params.source_db_url} --xref_db_url ${params.xref_db_url} --base_path ${params.base_path} --log_timestamp $timestamp """ } @@ -291,14 +279,8 @@ process ScheduleSecondaryParse { tuple val(species_name), path('dataflow_secondary_sources.json') tuple val(species_name), path('dataflow_schedule_tertiary.json') - shell: - cmd_params = "" - if (params.split_files_by_species) { - cmd_params = "${cmd_params} --get_species_file 1" - } - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 2 --source_db_url ${params.source_db_url} --base_path ${params.base_path} --log_timestamp $timestamp $cmd_params + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 2 --source_db_url ${params.source_db_url} --base_path ${params.base_path} --log_timestamp $timestamp """ } @@ -334,14 +316,8 @@ process ScheduleTertiaryParse { tuple val(species_name), path('dataflow_tertiary_sources.json') tuple val(species_name), path('dataflow_dump_ensembl.json') - shell: - cmd_params = "" - if (params.split_files_by_species) { - cmd_params = "${cmd_params} --get_species_file 1" - } - """ - python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 3 --source_db_url ${params.source_db_url} --base_path ${params.base_path} --log_timestamp 
$timestamp $cmd_params + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleParse --dataflow '$dataflow' --release ${params.release} --registry_url ${params.registry_url} --priority 3 --source_db_url ${params.source_db_url} --base_path ${params.base_path} --log_timestamp $timestamp """ } @@ -365,7 +341,7 @@ process ParseTertiarySource { } process DumpEnsembl { - label 'default_process' + label 'mem10GB' tag "$species_name" input: @@ -386,7 +362,7 @@ process DumpEnsembl { } process DumpXref { - label 'mem1GB' + label 'mem4GB' tag "$species_name" input: @@ -439,7 +415,7 @@ process Alignment { } process ScheduleMapping { - label 'small_process' + label 'mem1GB' tag "$species_name" input: @@ -564,7 +540,7 @@ process RunXrefCriticalDatacheck { val species_name """ - perl ${params.perl_scripts_dir}/run_process.pl -class='Nextflow::RunDataChecks' -datacheck_names='ForeignKeys' -datacheck_groups='xref_mapping' -datacheck_types='critical' -registry_file=${params.registry_file} -config_file=${params.dc_config_file} -history_file='${params.history_file}' -old_server_uri='${params.old_server_uri}' -failures_fatal=1 -species=$species_name + perl ${params.perl_scripts_dir}/run_process.pl -class='Nextflow::RunDataChecks' -datacheck_names='ForeignKeys' -datacheck_groups='xref_mapping' -datacheck_types='critical' -registry_file=${params.registry_file} -config_file=${params.dc_config_file} -failures_fatal=1 -species=$species_name """ } @@ -576,16 +552,17 @@ process RunXrefAdvisoryDatacheck { val species_name output: - tuple val(species_name), path('dataflow_4.json') + tuple val(species_name), path('dataflow_3.json'), emit: dataflow_success, optional: true + tuple val(species_name), path('dataflow_4.json'), emit: dataflow_fail, optional: true """ - perl ${params.perl_scripts_dir}/run_process.pl -class='Nextflow::RunDataChecks' -datacheck_groups='xref_mapping' -datacheck_types='advisory' -registry_file=${params.registry_file} -config_file=${params.dc_config_file} -history_file='${params.history_file}' -old_server_uri='${params.old_server_uri}' -failures_fatal=0 -species=$species_name + perl ${params.perl_scripts_dir}/run_process.pl -class='Nextflow::RunDataChecks' -datacheck_groups='xref_mapping' -datacheck_types='advisory' -registry_file=${params.registry_file} -config_file=${params.dc_config_file} -failures_fatal=0 -species=$species_name """ } process AdvisoryXrefReport { label 'default_process' - tag "$species_name - $dc_name" + tag "$species_name" input: tuple val(species_name), val(dataflow) @@ -594,9 +571,6 @@ process AdvisoryXrefReport { output: val species_name - shell: - dc_name = (dataflow =~ /"datacheck_name":\s*"([A-Za-z]+)"/)[0][1] - script: formatted_dataflow = dataflow.replace("'", '__') """ @@ -629,4 +603,4 @@ process NotifyByEmail { """ python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp """ -} \ No newline at end of file +} diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl index 0b956a31d..f1e09dde6 100644 --- a/scripts/xrefs/cleanup_and_split_source.pl +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -28,7 +28,7 @@ use Nextflow::Utils; -my ($base_path, $source_db_url, $source_name, $clean_dir, $clean_files, $version_file, $tax_ids_file, $update_mode, $log_timestamp); +my ($base_path, $source_db_url, 
$source_name, $clean_dir, $clean_files, $version_file, $tax_ids_file, $tax_ids_list, $update_mode, $log_timestamp); GetOptions( 'base_path=s' => \$base_path, 'source_db_url=s' => \$source_db_url, @@ -37,6 +37,7 @@ 'clean_files=i' => \$clean_files, 'version_file:s' => \$version_file, 'tax_ids_file:s' => \$tax_ids_file, + 'tax_ids_list:s' => \$tax_ids_list, 'update_mode:i' => \$update_mode, 'log_timestamp:s' => \$log_timestamp ); @@ -56,6 +57,7 @@ add_to_log_file($log_file, "CleanupSplitSource starting for source $source_name"); add_to_log_file($log_file, "Param: tax_ids_file = $tax_ids_file") if $tax_ids_file; + add_to_log_file($log_file, "Param: tax_ids_list = $tax_ids_list") if $tax_ids_list; } # Do nothing if not a uniprot or refseq source @@ -100,13 +102,19 @@ # Extract taxonomy IDs my %tax_ids; -my ($skipped_species, $added_species) = (0, 0); -if ($tax_ids_file && $update_mode) { +if ($tax_ids_list) { + $tax_ids_list =~ s/\s*,\s*/,/g; + %tax_ids = map { $_ => 1} split(",", $tax_ids_list); +} elsif ($tax_ids_file) { open my $fh, '<', $tax_ids_file or die "Couldn't open tax_ids_file '$tax_ids_file' $!"; chomp(my @lines = <$fh>); close $fh; %tax_ids = map { $_ => 1 } @lines; +} +my $tax_ids_filter = ($tax_ids_file || $tax_ids_list ? 1 : 0); +my ($skipped_species, $added_species) = (0, 0); +if ($tax_ids_filter && $update_mode) { # Check if any taxonomy IDs already have files foreach my $tax_id (keys %tax_ids) { my @tax_files = glob(catfile($output_path, "**", "**", "**", "**", "$output_file_name-$tax_id")); @@ -165,7 +173,7 @@ # Only continue with wanted species next unless $species_id; - next if $tax_ids_file && (!defined($tax_ids{$species_id}) || !$tax_ids{$species_id}); + next if $tax_ids_filter && (!defined($tax_ids{$species_id}) || !$tax_ids{$species_id}); # Clean up data if ($clean_files) { diff --git a/src/python/ensembl/production/xrefs/ScheduleParse.py b/src/python/ensembl/production/xrefs/ScheduleParse.py index 2317025f3..c9106b672 100644 --- a/src/python/ensembl/production/xrefs/ScheduleParse.py +++ b/src/python/ensembl/production/xrefs/ScheduleParse.py @@ -37,7 +37,6 @@ def run(self): order_priority: int = self.get_param("priority", {"required": True, "type": int}) source_db_url: str = self.get_param("source_db_url", {"required": True, "type": str}) xref_db_url: str = self.get_param("xref_db_url", {"required": True, "type": str}) - get_species_file: bool = self.get_param("get_species_file", {"required": True, "type": bool}) core_db_url: Optional[str] = self.get_param("species_db", {"type": str}) logging.info(f"ScheduleParse starting for species '{species_name}'") @@ -194,18 +193,17 @@ def run(self): ) # For Uniprot and Refseq, files might have been split by species - if get_species_file: - file_prefix = { - "Uniprot/SWISSPROT": "uniprot_sprot", - "Uniprot/SPTREMBL": "uniprot_trembl", - "RefSeq_dna": "refseq_rna", - "RefSeq_peptide": "refseq_protein", - }.get(source.name) - - if file_prefix: - list_files = glob.glob( - f"{file_name}/**/{file_prefix}-{species_id}", recursive=True - ) + file_prefix = { + "Uniprot/SWISSPROT": "uniprot_sprot", + "Uniprot/SPTREMBL": "uniprot_trembl", + "RefSeq_dna": "refseq_rna", + "RefSeq_peptide": "refseq_protein", + }.get(source.name) + + if file_prefix: + list_files = glob.glob( + f"{file_name}/**/{file_prefix}-{species_id}", recursive=True + ) if source.name == "ZFIN_ID": list_files = [list_files[0]] @@ -218,8 +216,7 @@ def run(self): dataflow_params["file_name"] = file if re.search(r"^Uniprot", source.name) and hgnc_path: - hgnc_files = 
glob.glob(os.path.join(hgnc_path, "*")) - dataflow_params["hgnc_file"] = hgnc_files[0] + dataflow_params["hgnc_file"] = hgnc_path self.write_output(dataflow_suffix, dataflow_params) total_sources += 1 From 95b107a12c28932f8f719b7b258f7fe485742191 Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Mon, 13 Jan 2025 11:25:11 +0000 Subject: [PATCH 11/12] Add --config_file parameter to list of params --- nextflow/workflows/xrefProcess.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nextflow/workflows/xrefProcess.nf b/nextflow/workflows/xrefProcess.nf index 58788f082..3136fc4c8 100644 --- a/nextflow/workflows/xrefProcess.nf +++ b/nextflow/workflows/xrefProcess.nf @@ -16,6 +16,7 @@ println """\ species : ${params.species} antispecies : ${params.antispecies} division : ${params.division} + config_file : ${params.config_file} sources_config_file : ${params.sources_config_file} registry_file : ${params.registry_file} dc_config_file : ${params.dc_config_file} @@ -51,6 +52,9 @@ def helpMessage() { --division (optional) Comma-separated list of divisions to run pipeline on. Will be disregarded if --run_all is set to 1. + --config_file (optional) Path to the json file containing information about xref sources to download. + Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_all_sources.json + --sources_config_file (optional) Path to the ini file containing information about all xref sources and species/divisions. Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_config.ini From 5ba71f5d470465c8e94f6dad6d36ad2bbf860f3f Mon Sep 17 00:00:00 2001 From: Tamara El Naboulsi Date: Tue, 21 Jan 2025 14:38:00 +0000 Subject: [PATCH 12/12] Minor email notification update + coordinate mapper fix --- .../ensembl/production/xrefs/Checksum.py | 6 +- .../production/xrefs/EmailNotification.py | 58 +++++++++++++------ .../production/xrefs/ScheduleCleanup.py | 8 +-- .../production/xrefs/ScheduleDownload.py | 6 +- .../xrefs/mappers/CoordinateMapper.py | 4 +- 5 files changed, 51 insertions(+), 31 deletions(-) diff --git a/src/python/ensembl/production/xrefs/Checksum.py b/src/python/ensembl/production/xrefs/Checksum.py index 2d990cf70..20aef07ab 100644 --- a/src/python/ensembl/production/xrefs/Checksum.py +++ b/src/python/ensembl/production/xrefs/Checksum.py @@ -28,9 +28,9 @@ def run(self): skip_download: bool = self.get_param("skip_download", {"required": True, "type": bool}) logging.info("Checksum starting with parameters:") - logging.info(f"Param: base_path = {base_path}") - logging.info(f"Param: source_db_url = {source_db_url}") - logging.info(f"Param: skip_download = {skip_download}") + logging.info(f"\tParam: base_path = {base_path}") + logging.info(f"\tParam: source_db_url = {source_db_url}") + logging.info(f"\tParam: skip_download = {skip_download}") # Connect to source db db_engine = self.get_db_engine(source_db_url) diff --git a/src/python/ensembl/production/xrefs/EmailNotification.py b/src/python/ensembl/production/xrefs/EmailNotification.py index 932b0c1b7..0acae8b6d 100644 --- a/src/python/ensembl/production/xrefs/EmailNotification.py +++ b/src/python/ensembl/production/xrefs/EmailNotification.py @@ -58,8 +58,8 @@ def run(self): sources_data, added_species, skipped_species = self.extract_download_statistics(data) email_message += self.format_download_statistics(sources_data, added_species, skipped_species) elif re.search("Process", pipeline_name): - parsed_sources, species_counts = self.extract_process_statistics(data) - email_message += 
self.format_process_statistics(parsed_sources, species_counts) + parsed_sources, absolute_sources, species_counts = self.extract_process_statistics(data) + email_message += self.format_process_statistics(parsed_sources, species_counts, absolute_sources) # Send email self.send_email(email_address, email_server, pipeline_name, email_message) @@ -117,8 +117,8 @@ def combine_logs(self, base_path: str, timestamp: str, type: str) -> str: return main_log_file def extract_parameters(self, data: str) -> Dict[str, str]: - parameters_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| Param: (\w+) = (.*)", data) - return {param[0]: param[1] for param in parameters_list} + parameters_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| \tParam: (\w+) = (.*)", data) + return {param[0]: param[1] for param in parameters_list if param[0] != 'order_priority'} def format_parameters(self, parameters: Dict[str, str]) -> str: message = "
<b>Run Parameters</b><br>
" @@ -217,14 +217,15 @@ def format_download_statistics(self, sources_data: Dict[str, Dict[str, Any]], ad return message - def extract_process_statistics(self, data: str) -> Tuple[Dict[str, Dict[str, str]], Dict[str, Dict[str, int]]]: - parsed_sources = self.extract_parsed_sources(data) + def extract_process_statistics(self, data: str) -> Tuple[Dict[str, Dict[str, str]], Dict[str, bool], Dict[str, Dict[str, int]]]: + parsed_sources, absolute_sources = self.extract_parsed_sources(data) species_counts = self.extract_species_counts(data) - return parsed_sources, species_counts + return parsed_sources, absolute_sources, species_counts - def extract_parsed_sources(self, data: str) -> Dict[str, Dict[str, str]]: + def extract_parsed_sources(self, data: str) -> Tuple[Dict[str, Dict[str, str]], Dict[str, bool]]: parsed_sources = {} + absolute_sources = {} matches_list = re.findall(r"^\d{2}-\w{3}-\d{4} \\| INFO \\| ParseSource starting for source '([\w\/]+)' with parser '([\w\/]+)' for species '([\w\/]+)'", data) for species in matches_list: @@ -232,8 +233,9 @@ def extract_parsed_sources(self, data: str) -> Dict[str, Dict[str, str]]: if species_name not in parsed_sources: parsed_sources[species_name] = {} parsed_sources[species_name][source_name] = parser + absolute_sources[source_name] = True - return parsed_sources + return parsed_sources, absolute_sources def extract_species_counts(self, data: str) -> Dict[str, Dict[str, int]]: species_counts = {} @@ -258,18 +260,36 @@ def extract_species_counts(self, data: str) -> Dict[str, Dict[str, int]]: return species_counts - def format_process_statistics(self, parsed_sources: Dict[str, Dict[str, str]], species_counts: Dict[str, Dict[str, int]]) -> str: - message = "
<br>--Species Statistics--<br>
" + def format_process_statistics(self, parsed_sources: Dict[str, Dict[str, str]], species_counts: Dict[str, Dict[str, int]], absolute_sources: Dict[str, bool]) -> str: + cell_style = 'style="border-right: 1px solid #000; padding: 5px;"' + + message = "
Source Statistics
" + message += f"" + for source_name in sorted(absolute_sources): + message += f"" + message += f"" for species_name, species_data in parsed_sources.items(): - message += f"{species_name}:
" - message += f"{self.INDENT}Sources parsed: " + ",".join(species_data.keys()) + "
" - - xref_counts = species_counts[species_name] - message += f"{self.INDENT}Xrefs added: " - for xref_type, count in xref_counts.items(): - message += f"{count} {xref_type} " - message += "
" + message += f"" + for source_name in sorted(absolute_sources): + message += f"" if source_name in species_data else f"" + message += "" + message += "
Species{source_name}
{species_name}X
" + + message += "
Xref Data Statistics
" + message += f"" + message += f"" + + for species_name, species_data in species_counts.items(): + message += f"" + message += f"" + message += f"" + message += f"" + message += f"" + message += f"" + message += f"" + message += "" + message += "
SpeciesDIRECTDEPENDENTINFERRED_PAIRCHECKSUMSEQUENCE_MATCHMISC
{species_name}{species_data['DIRECT']}{species_data['DEPENDENT']}{species_data['INFERRED_PAIR']}{species_data['CHECKSUM']}{species_data['SEQUENCE_MATCH']}{species_data['MISC']}
" return message diff --git a/src/python/ensembl/production/xrefs/ScheduleCleanup.py b/src/python/ensembl/production/xrefs/ScheduleCleanup.py index 9ec5a8b7e..33a60d065 100644 --- a/src/python/ensembl/production/xrefs/ScheduleCleanup.py +++ b/src/python/ensembl/production/xrefs/ScheduleCleanup.py @@ -35,10 +35,10 @@ def run(self): clean_dir: Optional[str] = self.get_param("clean_dir", {"type": str}) logging.info("ScheduleCleanup starting with parameters:") - logging.info(f"Param: base_path = {base_path}") - logging.info(f"Param: source_db_url = {source_db_url}") - logging.info(f"Param: clean_files = {clean_files}") - logging.info(f"Param: clean_dir = {clean_dir}") + logging.info(f"\tParam: base_path = {base_path}") + logging.info(f"\tParam: source_db_url = {source_db_url}") + logging.info(f"\tParam: clean_files = {clean_files}") + logging.info(f"\tParam: clean_dir = {clean_dir}") # Connect to source db db_engine = self.get_db_engine(source_db_url) diff --git a/src/python/ensembl/production/xrefs/ScheduleDownload.py b/src/python/ensembl/production/xrefs/ScheduleDownload.py index 10b2a32af..a49feb59d 100644 --- a/src/python/ensembl/production/xrefs/ScheduleDownload.py +++ b/src/python/ensembl/production/xrefs/ScheduleDownload.py @@ -26,9 +26,9 @@ def run(self) -> None: reuse_db: bool = self.get_param("reuse_db", {"required": True, "type": bool}) logging.info("ScheduleDownload starting with parameters:") - logging.info(f"Param: config_file = {config_file}") - logging.info(f"Param: source_db_url = {source_db_url}") - logging.info(f"Param: reuse_db = {reuse_db}") + logging.info(f"\tParam: config_file = {config_file}") + logging.info(f"\tParam: source_db_url = {source_db_url}") + logging.info(f"\tParam: reuse_db = {reuse_db}") # Create the source db from url self.create_source_db(source_db_url, reuse_db) diff --git a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py index 35ee7ac80..2ed64a65f 100644 --- a/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py +++ b/src/python/ensembl/production/xrefs/mappers/CoordinateMapper.py @@ -119,8 +119,8 @@ def run_coordinatemapping(self, species_name: str, species_id: int, scripts_dir: logging.info(f"Running perl script {mapper_script}") perl_cmd = [ "perl", mapper_script, - "--xref_db_url", str(self.xref()), - "--core_db_url", str(self.core()), + "--xref_db_url", str(self.xref().url), + "--core_db_url", str(self.core().url), "--species_id", str(species_id), "--output_dir", output_dir, "--analysis_id", str(analysis_id)