Skip to content
Draft
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
aaf6d9a
Rust side implementation for GoFlow search index
afg1 Oct 13, 2025
53f2fef
Python bits of the goflowllm search index export
afg1 Oct 13, 2025
ffa4e42
Add necessary sql and nextflow bits
afg1 Oct 13, 2025
7c0b270
Merge branch 'dev' into gfllm-search-export
afg1 Oct 13, 2025
ca3ee09
Update rnacentral_pipeline/cli/r2dt.py
afg1 Oct 14, 2025
bbb78e2
Update rnacentral_pipeline/databases/ensembl/genomes/urls.py
afg1 Oct 14, 2025
c40cef0
Remove unused fetching of ensembl latest release from parsing release…
afg1 Oct 14, 2025
3e818d6
Remove some debugging print statements
afg1 Oct 14, 2025
a4901b7
Update rnacentral_pipeline/databases/ensembl/genomes/urls.py
afg1 Oct 18, 2025
0efdf8d
Remove some dead code relating to finding the ensembl release number
afg1 Oct 18, 2025
f30bead
Reinstate conditionals for running r2dt
afg1 Oct 18, 2025
0c2d944
Fix other instances of release matching not being defensive about no …
afg1 Oct 18, 2025
96d37fd
Update rnacentral_pipeline/rnacentral/r2dt/parser.py
afg1 Oct 18, 2025
29dc710
Link up go flow search export processes properly
afg1 Oct 25, 2025
6332d64
Use less stringent regex in ensembl release detection
afg1 Oct 27, 2025
cd6e07d
Remove some trailing whitespace
afg1 Oct 27, 2025
fa07512
Raise value errors instead of relying on assertions
afg1 Oct 27, 2025
b045965
Merge branch 'dev' into gfllm-search-export
afg1 Oct 27, 2025
f00a676
Fix not passing goflow data to merging process correctly
afg1 Oct 27, 2025
acdba3c
Update expected key from search export rust code
afg1 Oct 27, 2025
fa64872
Rust side CLI requires kebab-case filename to match file type enum, s…
afg1 Oct 27, 2025
77583ac
Improve documentation comment in sequence raw handler
afg1 Oct 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bin/litscan-get-articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def create_xml_file(results, directory):
ET.SubElement(additional_fields, "field", name="score").text = item['score']
ET.SubElement(additional_fields, "field", name="cited_by").text = item['cited_by']
ET.SubElement(additional_fields, "field", name="type").text = item['type']
ET.SubElement(additional_fields, "field", name="rna_related").text = item['rna_related']
ET.SubElement(additional_fields, "field", name="job_id").text = elem["display_id"]
ET.SubElement(additional_fields, "field", name="title_value").text = elem['id_in_title']
ET.SubElement(additional_fields, "field", name="abstract_value").text = elem['id_in_abstract']
Expand Down Expand Up @@ -114,6 +115,7 @@ def main(database, directory):
article['score'] = str(row[8])
article['cited_by'] = str(row[9])
article['type'] = row[11]
article['rna_related'] = row[12]
articles_list.append(article)

for article in articles_list:
Expand Down
13 changes: 13 additions & 0 deletions files/search-export/parts/goflow.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Export, as one JSON object per line, every search-export sequence that has
-- GoFlowLLM curation results, flagged so the search index can surface the
-- GoFlow annotation section for it.
--
-- NOTE(review): a plain join will emit one row per matching curation result;
-- if go_flow_llm_curation_results can hold several rows per urs_taxid this
-- produces duplicates (tolerated downstream by the AnyNumber grouping) --
-- confirm, or switch to a semijoin (WHERE EXISTS) if exactly-one is wanted.
COPY (
    SELECT
        json_build_object(
            'id', search_urs.id,
            'urs_taxid', search_urs.urs_taxid,
            'should_show_goflow', true
        )
    FROM search_export_urs search_urs
    INNER JOIN go_flow_llm_curation_results curated
        ON search_urs.urs_taxid = curated.urs_taxid
    ORDER BY search_urs.id
) TO STDOUT
10 changes: 5 additions & 5 deletions rnacentral_pipeline/cli/r2dt.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,11 +262,11 @@ def r2dt_prepare_s3(model_info, directory, output, file_list, allow_missing):
@click.option("--max_sequences", default=-1)
def r2dt_prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences):
"""
Prepare the sequences extracted from RNAcentral
Prepare a list of URS identifiers to fetch sequences for.

This means we will load and deduplicate the json file before rewriting
a json file containing only the requested number of sequences.

The default will be to write out all sequences
This takes a file of all URS identifiers from cross-references and a file
of already tracked URS identifiers. It produces a file of URS identifiers
that are in the xref file but not in the tracked file. This can be limited
to a maximum number of sequences.
"""
r2dt.prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences)
15 changes: 6 additions & 9 deletions rnacentral_pipeline/databases/ensembl/genomes/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,16 @@

LOGGER = logging.getLogger(__name__)


def list_releases(ftp: FTP) -> ty.List[str]:
return [f for f in ftp.nlst() if f.startswith("release-")]


def latest_release(releases: ty.List[str], ftp: FTP) -> str:
def latest_release(ftp: FTP) -> str:
## Parse the readme for the current release to avoid getting a half baked release
readme_lines = []
ftp.retrlines("RETR current_README", readme_lines.append)
cur_readme = "\n".join(readme_lines)
pattern = r"Ensembl Release (\d+) Databases."
release = re.search(pattern, cur_readme).group(1)
match = re.search(pattern, cur_readme)
if not match:
raise ValueError("Could not find release number in README")
release = match.group(1)
print(f"Ensembl release {release}")
return f"release-{release}"

Expand Down Expand Up @@ -95,8 +93,7 @@ def urls_for(division: Division, host: str) -> ty.Iterable[FtpInfo]:
ftp.login()
print("LOGIN")
ftp.cwd(f"pub/{division.name}/")
releases = list_releases(ftp)
latest = latest_release(releases, ftp)
latest = latest_release(ftp)
with species_info(ftp, division, latest) as info:
url_base = f"ftp://{host}/pub/{division.name}"
yield from generate_paths(ftp, division, url_base, latest, info)
14 changes: 6 additions & 8 deletions rnacentral_pipeline/databases/ensembl/vertebrates/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,17 @@
from rnacentral_pipeline.databases.ensembl.data import Division, FtpInfo


def list_releases(ftp: FTP) -> ty.List[str]:
return [f for f in ftp.nlst() if f.startswith("release-")]


def latest_release(releases: ty.List[str], ftp: FTP) -> str:
def latest_release(ftp: FTP) -> str:
## Parse the readme for the current release to avoid getting a half baked release
readme_lines = []
ftp.retrlines("RETR current_README", readme_lines.append)
cur_readme = "\n".join(readme_lines)
pattern = r"Ensembl Release (\d+) Databases."
release = re.search(pattern, cur_readme).group(1)
print(f"Ensembl release {release}")
match = re.search(pattern, cur_readme)
if not match:
raise ValueError("Could not determine latest Ensembl release from README")
release = match.group(1)
return f"release-{release}"


Expand Down Expand Up @@ -71,7 +70,6 @@ def urls_for(host: str) -> ty.Iterable[FtpInfo]:
with FTP(host) as ftp:
ftp.login()
ftp.cwd("pub")
releases = list_releases(ftp)
latest = latest_release(releases, ftp)
latest = latest_release(ftp)
with species_info(ftp, latest) as info:
yield from generate_paths(f"ftp://{host}/pub", latest, info)
1 change: 1 addition & 0 deletions rnacentral_pipeline/databases/generic/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ def features(record):
f"Skipping sequence feature {key} due to unexpected type {type(feature)}"
)
continue

## Skip sequence features that don't have the required fields
if feature.get("indexes", None) is None:
LOGGER.warning(
Expand Down
9 changes: 4 additions & 5 deletions rnacentral_pipeline/rnacentral/genome_mapping/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,6 @@ def toplevel_file(
toplevel = base.format(type="toplevel")
base_result = f"ftp://{host}{directory}/{{file}}"

print(primary)
print(toplevel)
print(files)

if primary in files:
return base_result.format(file=primary)
elif toplevel in files:
Expand Down Expand Up @@ -168,7 +164,10 @@ def url_for(species: str, assembly_id: str, kind: str, host: str, soft_masked=Fa
conn.retrlines("RETR current_README", readme_lines.append)
cur_readme = "\n".join(readme_lines)
pattern = r"[Cc]urrent release is (?:Ensembl )?Genomes\s*(\d+)"
release = re.search(pattern, cur_readme).group(1)
match = re.search(pattern, cur_readme)
if not match:
raise ValueError("Could not determine latest Ensembl release from README")
release = match.group(1)

for path in host.paths(species, kind):
try:
Expand Down
3 changes: 1 addition & 2 deletions rnacentral_pipeline/rnacentral/r2dt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,6 @@ def write_inspect_data(handle: ty.IO, db_url: str, output: ty.IO):


def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences):
print(urs_to_fetch.name)
raw_xref = (
pl.scan_csv(xref_urs.name, has_header=False, low_memory=True)
.unique()
Expand All @@ -162,7 +161,7 @@ def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences):

raw_tracked = pl.scan_csv(
tracked_urs.name, low_memory=True
).unique() ## May not need to be uniqued?
).unique()

to_fetch = raw_xref.join(raw_tracked, on="urs", how="anti")

Expand Down
2 changes: 1 addition & 1 deletion rnacentral_pipeline/rnacentral/r2dt/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def parse(
old_model_name = model_name
model_name = temp_model_name_lookup.get(model_name, None)
if model_name is None:
raise ValueError("No info for model %s", old_model_name)
raise ValueError(f"No info for model {old_model_name}")

minfo = model_info[model_name]
info = data.R2DTResultInfo(urs, minfo, source, result_base)
Expand Down
4 changes: 4 additions & 0 deletions rnacentral_pipeline/rnacentral/search_export/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,9 @@ def has_publications(counts):
def has_litsumm(litsumm):
return str(bool(litsumm))

def has_go_flow_llm_annotation(go_flow):
return str(bool(go_flow))


def has_editing_event(editing_events):
return str(bool(editing_events))
Expand Down Expand Up @@ -881,6 +884,7 @@ def edit_ref_to_edit(editing_events):
edit_repeat_type,
keys="editing_events",
),
field("has_go_flow_llm_annotation", has_go_flow_llm_annotation, keys="goflow"),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The key goflow used here does not match the field name go_flow_llm_annotations in the Normalized Rust struct that generates the JSON data. This will result in a KeyError when the Python script tries to access the data. To ensure consistency with other fields like litsumm and editing_events, I've suggested a change in the Rust code to rename the field to goflow during serialization. Alternatively, you could change this key to go_flow_llm_annotations.

## Add new fields above this line! Otherwise editing the produced xml is hard.
tree("so_rna_type", so_rna_type_tree, key="so_rna_type_tree"),
],
Expand Down
9 changes: 9 additions & 0 deletions utils/search-export/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ pub enum Groupable {
SoInfo,
LitsummSummaries,
EditingEvents,
GoFlowAnnotation,
}

#[derive(Debug, StructOpt)]
Expand Down Expand Up @@ -140,6 +141,10 @@ enum SequenceCommand {
/// RNA editing events
editing_events: PathBuf,

#[structopt(parse(from_os_str))]
/// GoFlowLLM annotations
go_flow_llm_annotations: PathBuf,

// Add new arguments above this line!
#[structopt(parse(from_os_str))]
/// Filename to write the results to, '-' means stdout
Expand Down Expand Up @@ -255,6 +260,7 @@ fn main() -> Result<()> {
Groupable::EditingEvents => {
sequences::editing_events::group(&path, max_count, &output)?
},
Groupable::GoFlowAnnotation => sequences::go_flow_annotations::group(&path, max_count, &output)?,
},
Subcommand::Sequences {
command,
Expand All @@ -275,6 +281,8 @@ fn main() -> Result<()> {
litsumm_summaries,
editing_events,
so_term_tree,
go_flow_llm_annotations,
// Add new arguments above this line!
output,
} => sequences::writers::write_merge(
vec![
Expand All @@ -293,6 +301,7 @@ fn main() -> Result<()> {
editing_events,
orfs,
so_term_tree,
go_flow_llm_annotations,
],
&output,
)?,
Expand Down
15 changes: 14 additions & 1 deletion utils/search-export/src/sequences/file_joiner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ use super::{
rfam_hit::RfamHit,
so_tree,
so_tree::SoMapping,
go_flow_annotations::GoFlowLLMAnnotation,
};

#[derive(Debug, Error)]
Expand Down Expand Up @@ -98,6 +99,7 @@ pub enum FileTypes {
PublicationCount,
LitsummSummaries,
EditingEvents,
GoFlowLLMAnnotations,
SoTermTree,
}

Expand All @@ -116,6 +118,7 @@ pub struct FileJoiner<'de> {
rfam_hits: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<RfamHit>>,
publication_counts: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<PublicationCount>>,
lit_summ: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<LitsummSummaries>>,
go_flow_llm_annotations: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<GoFlowLLMAnnotation>>,
editing_events: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<EditingEvent>>,
so_info: SoMapping,
}
Expand Down Expand Up @@ -203,6 +206,7 @@ impl FileJoinerBuilder {
let publication_counts = self.iterator_for(FileTypes::PublicationCount)?;
let lit_summ = self.iterator_for(FileTypes::LitsummSummaries)?;
let editing_events = self.iterator_for(FileTypes::EditingEvents)?;
let go_flow_llm_annotations = self.iterator_for(FileTypes::GoFlowLLMAnnotations)?;
let so_info = so_tree::load(self.path_for(FileTypes::SoTermTree)?)?;

Ok(FileJoiner {
Expand All @@ -220,6 +224,7 @@ impl FileJoinerBuilder {
publication_counts,
lit_summ,
editing_events,
go_flow_llm_annotations,
so_info,
})
}
Expand All @@ -244,6 +249,7 @@ impl<'de> Iterator for FileJoiner<'de> {
self.publication_counts.next(),
self.lit_summ.next(),
self.editing_events.next(),
self.go_flow_llm_annotations.next(),
);

match current {
Expand All @@ -262,6 +268,7 @@ impl<'de> Iterator for FileJoiner<'de> {
None,
None,
None,
None,
) => None,
(
Some(Ok(Required {
Expand Down Expand Up @@ -320,6 +327,10 @@ impl<'de> Iterator for FileJoiner<'de> {
id: id14,
data: editing_events,
})),
Some(Ok(Multiple {
id: id15,
data: goflow_llm_annotations,
})),
) => {
if id1 != id2
|| id1 != id3
Expand All @@ -334,9 +345,10 @@ impl<'de> Iterator for FileJoiner<'de> {
|| id1 != id12
|| id1 != id13
|| id1 != id14
|| id1 != id15
{
return Some(Err(Error::OutofSyncData(vec![
id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14,
id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14, id15
])));
}

Expand All @@ -362,6 +374,7 @@ impl<'de> Iterator for FileJoiner<'de> {
.publication_counts(publication_counts)
.litsumm_summaries(lit_summ)
.editing_events(editing_events)
.go_flow_llm_annotations(goflow_llm_annotations)
.so_tree(so_tree)
.build();

Expand Down
34 changes: 34 additions & 0 deletions utils/search-export/src/sequences/go_flow_annotations.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
use serde::{
Deserialize,
Serialize,
};
use std::path::Path;

use anyhow::Result;
use rnc_core::grouper;

/// One GoFlowLLM curation record for the search export.
///
/// Deserialized from the JSON emitted by `files/search-export/parts/goflow.sql`,
/// which produces objects of the form
/// `{"id": ..., "urs_taxid": ..., "should_show_goflow": true}`.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct GoFlowLLMAnnotation {
    /// Search-export sequence id; this is the grouping index (see `HasIndex`).
    pub id: usize,
    /// The URS_taxid this annotation applies to.
    urs_taxid: String,
    /// Always `true` in the SQL export — the presence of a row is the signal.
    should_show_goflow: bool,
}

impl grouper::HasIndex for GoFlowLLMAnnotation {
    /// The search-export sequence id the grouper collates records by.
    fn index(&self) -> usize {
        self.id
    }
}

/// Group GoFlowLLM annotation records by their search-export sequence id.
///
/// Reads `GoFlowLLMAnnotation` JSON records from `path` and writes grouped
/// output to `output`, `max` being the upper bound on the id range. The
/// `AnyNumber` criteria accepts any count of records per id, since most
/// sequences have no GoFlow curation results.
///
/// # Errors
/// Propagates any I/O or parse error from the underlying grouper.
pub fn group(path: &Path, max: usize, output: &Path) -> Result<()> {
    // `path` and `output` are already references; passing `&path` / `&output`
    // created needless double references (clippy::needless_borrow).
    grouper::group::<GoFlowLLMAnnotation>(grouper::Criteria::AnyNumber, path, 1, max, output)
}

impl GoFlowLLMAnnotation {
    /// Borrow the URS_taxid identifier this annotation applies to.
    pub fn urs_taxid(&self) -> &str {
        self.urs_taxid.as_str()
    }

    /// Whether the search index should show the GoFlow section for this
    /// sequence.
    pub fn should_show_goflow(&self) -> bool {
        self.should_show_goflow
    }
}
1 change: 1 addition & 0 deletions utils/search-export/src/sequences/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@ pub mod qa_status;
pub mod r2dt;
pub mod raw;
pub mod rfam_hit;
pub mod go_flow_annotations;
pub mod so_tree;
pub mod writers;
3 changes: 3 additions & 0 deletions utils/search-export/src/sequences/normalized.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use crate::sequences::{
r2dt::R2dt,
raw::Raw,
rfam_hit::RfamHitVec,
go_flow_annotations::GoFlowLLMAnnotation,
so_tree,
};

Expand Down Expand Up @@ -69,6 +70,7 @@ pub struct Normalized {
publication_count: usize,
litsumm: Vec<LitsummSummaries>,
editing_events: Vec<EditingEvent>,
go_flow_llm_annotations: Vec<GoFlowLLMAnnotation>,
so_rna_type_tree: so_tree::SoTree,

#[serde(flatten)]
Expand Down Expand Up @@ -129,6 +131,7 @@ impl Normalized {
rfam_hits: raw.rfam_hits().to_owned().into_iter().collect(),
orfs: raw.orfs().to_vec().into_iter().collect(),
litsumm: raw.litsumm_summaries().to_vec(),
go_flow_llm_annotations: raw.go_flow_llm_annotations().to_vec(),
editing_events: raw.editing_events().to_vec(),
})
}
Expand Down
Loading