22 commits
aaf6d9a
Rust side implementation for GoFlow search index
afg1 Oct 13, 2025
53f2fef
Python bits of the goflowllm search index export
afg1 Oct 13, 2025
ffa4e42
Add necessary sql and nextflow bits
afg1 Oct 13, 2025
7c0b270
Merge branch 'dev' into gfllm-search-export
afg1 Oct 13, 2025
ca3ee09
Update rnacentral_pipeline/cli/r2dt.py
afg1 Oct 14, 2025
bbb78e2
Update rnacentral_pipeline/databases/ensembl/genomes/urls.py
afg1 Oct 14, 2025
c40cef0
Remove unused fetching of ensembl latest release from parsing release…
afg1 Oct 14, 2025
3e818d6
Remove some debugging print statements
afg1 Oct 14, 2025
a4901b7
Update rnacentral_pipeline/databases/ensembl/genomes/urls.py
afg1 Oct 18, 2025
0efdf8d
Remove some dead code relating to finding the ensembl release number
afg1 Oct 18, 2025
f30bead
Reinstate conditionals for running r2dt
afg1 Oct 18, 2025
0c2d944
Fix other instances of release matching not being defensive about no …
afg1 Oct 18, 2025
96d37fd
Update rnacentral_pipeline/rnacentral/r2dt/parser.py
afg1 Oct 18, 2025
29dc710
Link up go flow search export processes properly
afg1 Oct 25, 2025
6332d64
Use less stringent regex in ensembl release detection
afg1 Oct 27, 2025
cd6e07d
Remove some trailing whitespace
afg1 Oct 27, 2025
fa07512
Raise value errors instead of relying on assertions
afg1 Oct 27, 2025
b045965
Merge branch 'dev' into gfllm-search-export
afg1 Oct 27, 2025
f00a676
Fix not passing goflow data to merging process correctly
afg1 Oct 27, 2025
acdba3c
Update expected key from search export rust code
afg1 Oct 27, 2025
fa64872
Rust side CLI requires kebab-case filename to match file type enum, s…
afg1 Oct 27, 2025
77583ac
Improve documentation comment in sequence raw handler
afg1 Oct 27, 2025
13 changes: 13 additions & 0 deletions files/search-export/parts/goflow.sql
@@ -0,0 +1,13 @@
COPY (
SELECT
json_build_object(
'id', todo.id,
'urs_taxid', todo.urs_taxid,
'should_show_goflow', true
)
FROM search_export_urs todo
JOIN go_flow_llm_curation_results gfllm
ON
todo.urs_taxid = gfllm.urs_taxid
ORDER by todo.id
) TO STDOUT
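Each row this COPY streams to STDOUT is one JSON object per line, and its keys have to line up with the GoFlowLLMAnnotation struct the Rust exporter deserializes. An illustrative (made-up) row:

    {"id": 42, "urs_taxid": "URS0000000001_9606", "should_show_goflow": true}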
5 changes: 3 additions & 2 deletions rnacentral_pipeline/databases/ensembl/genomes/urls.py
@@ -30,8 +30,9 @@ def latest_release(ftp: FTP) -> str:
readme_lines = []
ftp.retrlines("RETR current_README", readme_lines.append)
cur_readme = "\n".join(readme_lines)
pattern = r"Ensembl Release (\d+) Databases."
match = re.search(pattern, cur_readme)
pattern = r"Ensembl Release (\d+) Databases\."
match = re.search(pattern, cur_readme, re.IGNORECASE)

if not match:
raise ValueError("Could not find release number in README")
release = match.group(1)
5 changes: 3 additions & 2 deletions rnacentral_pipeline/databases/ensembl/vertebrates/urls.py
@@ -29,8 +29,9 @@ def latest_release(ftp: FTP) -> str:
readme_lines = []
ftp.retrlines("RETR current_README", readme_lines.append)
cur_readme = "\n".join(readme_lines)
pattern = r"Ensembl Release (\d+) Databases."
match = re.search(pattern, cur_readme)
pattern = r"Ensembl Release (\d+) Databases\."
match = re.search(pattern, cur_readme, re.IGNORECASE)

if not match:
raise ValueError("Could not determine latest Ensembl release from README")
release = match.group(1)
4 changes: 1 addition & 3 deletions rnacentral_pipeline/rnacentral/r2dt/__init__.py
@@ -159,9 +159,7 @@ def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences):
.rename({"column_1": "urs"})
)

raw_tracked = pl.scan_csv(
tracked_urs.name, low_memory=True
).unique()
raw_tracked = pl.scan_csv(tracked_urs.name, low_memory=True).unique()

to_fetch = raw_xref.join(raw_tracked, on="urs", how="anti")

8 changes: 6 additions & 2 deletions rnacentral_pipeline/rnacentral/r2dt/data.py
@@ -409,11 +409,15 @@ def dot_bracket(self):
seq_dot = str(record.seq)
## Use indices instead, assert that the string is even length
## If not, then the two parts are not the same length
assert len(seq_dot) % 2 == 0, f"Odd length sequence {len(seq_dot)}"
if len(seq_dot) % 2 != 0:
raise ValueError(f"Odd length sequence {len(seq_dot)}")
seq_dot_len = len(seq_dot)
sequence = seq_dot[0 : seq_dot_len // 2]
dot_bracket = seq_dot[(seq_dot_len // 2) :]
assert len(sequence) == len(dot_bracket)
if len(sequence) != len(dot_bracket):
raise ValueError(
f"Sequence and dot bracket lengths do not match: {len(sequence)} != {len(dot_bracket)}"
)
return dot_bracket

def basepair_count(self):
4 changes: 4 additions & 0 deletions rnacentral_pipeline/rnacentral/search_export/data.py
@@ -708,6 +708,9 @@ def has_publications(counts):
def has_litsumm(litsumm):
return str(bool(litsumm))

def has_go_flow_llm_annotation(go_flow):
return str(bool(go_flow))


def has_editing_event(editing_events):
return str(bool(editing_events))
@@ -881,6 +884,7 @@ def edit_ref_to_edit(editing_events):
edit_repeat_type,
keys="editing_events",
),
field("has_go_flow_llm_annotation", has_go_flow_llm_annotation, keys="goflow"),
Review comment (Contributor, severity: high):

The key goflow used here does not match the field name go_flow_llm_annotations in the Normalized Rust struct that generates the JSON data. This will result in a KeyError when the Python script tries to access the data. To ensure consistency with other fields like litsumm and editing_events, I've suggested a change in the Rust code to rename the field to goflow during serialization. Alternatively, you could change this key to go_flow_llm_annotations.
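
For example, a minimal sketch (illustrative only, not part of this diff) of the serde-side rename that would make the emitted JSON key match keys="goflow":

    use serde::Serialize;

    // Hypothetical, self-contained illustration of the suggested rename; in the real
    // Normalized struct the field type is Vec<GoFlowLLMAnnotation>, stubbed here as
    // Vec<String> so the snippet compiles on its own (requires serde and serde_json).
    #[derive(Serialize)]
    struct Normalized {
        #[serde(rename = "goflow")]
        go_flow_llm_annotations: Vec<String>,
    }

    fn main() {
        let n = Normalized { go_flow_llm_annotations: vec![] };
        println!("{}", serde_json::to_string(&n).unwrap()); // prints {"goflow":[]}
    }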

## Add new fields above this line! Otherwise editing the produced xml is hard.
tree("so_rna_type", so_rna_type_tree, key="so_rna_type_tree"),
],
9 changes: 9 additions & 0 deletions utils/search-export/src/main.rs
@@ -33,6 +33,7 @@ pub enum Groupable {
SoInfo,
LitsummSummaries,
EditingEvents,
GoFlowAnnotation,
}

#[derive(Debug, StructOpt)]
@@ -140,6 +141,10 @@ enum SequenceCommand {
/// RNA editing events
editing_events: PathBuf,

#[structopt(parse(from_os_str))]
/// GoFlowLLM annotations
go_flow_llm_annotations: PathBuf,

// Add new arguments above this line!
#[structopt(parse(from_os_str))]
/// Filename to write the results to, '-' means stdout
@@ -255,6 +260,7 @@ fn main() -> Result<()> {
Groupable::EditingEvents => {
sequences::editing_events::group(&path, max_count, &output)?
},
Groupable::GoFlowAnnotation => sequences::go_flow_annotations::group(&path, max_count, &output)?,
},
Subcommand::Sequences {
command,
@@ -275,6 +281,8 @@ fn main() -> Result<()> {
litsumm_summaries,
editing_events,
so_term_tree,
go_flow_llm_annotations,
// Add new arguments above this line!
output,
} => sequences::writers::write_merge(
vec![
@@ -293,6 +301,7 @@
editing_events,
orfs,
so_term_tree,
go_flow_llm_annotations,
],
&output,
)?,
15 changes: 14 additions & 1 deletion utils/search-export/src/sequences/file_joiner.rs
@@ -49,6 +49,7 @@ use super::{
rfam_hit::RfamHit,
so_tree,
so_tree::SoMapping,
go_flow_annotations::GoFlowLLMAnnotation,
};

#[derive(Debug, Error)]
@@ -98,6 +99,7 @@ pub enum FileTypes {
PublicationCount,
LitsummSummaries,
EditingEvents,
GoFlowLLMAnnotations,
SoTermTree,
}

@@ -116,6 +118,7 @@ pub struct FileJoiner<'de> {
rfam_hits: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<RfamHit>>,
publication_counts: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<PublicationCount>>,
lit_summ: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<LitsummSummaries>>,
go_flow_llm_annotations: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<GoFlowLLMAnnotation>>,
editing_events: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<EditingEvent>>,
so_info: SoMapping,
}
@@ -203,6 +206,7 @@ impl FileJoinerBuilder {
let publication_counts = self.iterator_for(FileTypes::PublicationCount)?;
let lit_summ = self.iterator_for(FileTypes::LitsummSummaries)?;
let editing_events = self.iterator_for(FileTypes::EditingEvents)?;
let go_flow_llm_annotations = self.iterator_for(FileTypes::GoFlowLLMAnnotations)?;
let so_info = so_tree::load(self.path_for(FileTypes::SoTermTree)?)?;

Ok(FileJoiner {
@@ -220,6 +224,7 @@
publication_counts,
lit_summ,
editing_events,
go_flow_llm_annotations,
so_info,
})
}
@@ -244,6 +249,7 @@ impl<'de> Iterator for FileJoiner<'de> {
self.publication_counts.next(),
self.lit_summ.next(),
self.editing_events.next(),
self.go_flow_llm_annotations.next(),
);

match current {
@@ -262,6 +268,7 @@
None,
None,
None,
None,
) => None,
(
Some(Ok(Required {
@@ -320,6 +327,10 @@
id: id14,
data: editing_events,
})),
Some(Ok(Multiple {
id: id15,
data: goflow_llm_annotations,
})),
) => {
if id1 != id2
|| id1 != id3
@@ -334,9 +345,10 @@
|| id1 != id12
|| id1 != id13
|| id1 != id14
|| id1 != id15
{
return Some(Err(Error::OutofSyncData(vec![
id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14,
id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14, id15
])));
}

@@ -362,6 +374,7 @@
.publication_counts(publication_counts)
.litsumm_summaries(lit_summ)
.editing_events(editing_events)
.go_flow_llm_annotations(goflow_llm_annotations)
.so_tree(so_tree)
.build();

34 changes: 34 additions & 0 deletions utils/search-export/src/sequences/go_flow_annotations.rs
@@ -0,0 +1,34 @@
use serde::{
Deserialize,
Serialize,
};
use std::path::Path;

use anyhow::Result;
use rnc_core::grouper;

#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct GoFlowLLMAnnotation {
pub id: usize,
urs_taxid: String,
should_show_goflow: bool,
}

impl grouper::HasIndex for GoFlowLLMAnnotation {
fn index(&self) -> usize {
self.id
}
}

pub fn group(path: &Path, max: usize, output: &Path) -> Result<()> {
grouper::group::<GoFlowLLMAnnotation>(grouper::Criteria::AnyNumber, &path, 1, max, &output)
}

impl GoFlowLLMAnnotation {
pub fn should_show_goflow(&self) -> bool {
self.should_show_goflow
}
pub fn urs_taxid(&self) -> &str {
&self.urs_taxid
}
}
1 change: 1 addition & 0 deletions utils/search-export/src/sequences/mod.rs
@@ -16,5 +16,6 @@ pub mod qa_status;
pub mod r2dt;
pub mod raw;
pub mod rfam_hit;
pub mod go_flow_annotations;
pub mod so_tree;
pub mod writers;
3 changes: 3 additions & 0 deletions utils/search-export/src/sequences/normalized.rs
@@ -37,6 +37,7 @@ use crate::sequences::{
r2dt::R2dt,
raw::Raw,
rfam_hit::RfamHitVec,
go_flow_annotations::GoFlowLLMAnnotation,
so_tree,
};

@@ -69,6 +70,7 @@ pub struct Normalized {
publication_count: usize,
litsumm: Vec<LitsummSummaries>,
editing_events: Vec<EditingEvent>,
go_flow_llm_annotations: Vec<GoFlowLLMAnnotation>,
so_rna_type_tree: so_tree::SoTree,

#[serde(flatten)]
@@ -129,6 +131,7 @@ impl Normalized {
rfam_hits: raw.rfam_hits().to_owned().into_iter().collect(),
orfs: raw.orfs().to_vec().into_iter().collect(),
litsumm: raw.litsumm_summaries().to_vec(),
go_flow_llm_annotations: raw.go_flow_llm_annotations().to_vec(),
editing_events: raw.editing_events().to_vec(),
})
}
7 changes: 7 additions & 0 deletions utils/search-export/src/sequences/raw.rs
@@ -24,6 +24,7 @@ use crate::sequences::{
qa_status::QaStatus,
r2dt::R2dt,
rfam_hit::RfamHit,
go_flow_annotations::GoFlowLLMAnnotation,
so_tree,
};

@@ -46,6 +47,7 @@ pub struct Raw {
publication_counts: Option<PublicationCount>,
litsumm_summaries: Vec<LitsummSummaries>,
editing_events: Vec<EditingEvent>,
go_flow_llm_annotations: Vec<GoFlowLLMAnnotation>,
so_tree: so_tree::SoTree,
}

@@ -148,6 +150,11 @@ impl Raw {
&self.editing_events
}

/// Get a reference to the raw's editing events.
Review comment (Contributor, severity: medium):

This documentation comment appears to be a copy-paste from the editing_events function. It should be updated to describe the go_flow_llm_annotations function.

Suggested change:
- /// Get a reference to the raw's editing events.
+ /// Get a reference to the raw's go flow llm annotations.

pub fn go_flow_llm_annotations(&self) -> &[GoFlowLLMAnnotation] {
&self.go_flow_llm_annotations
}

/// Get this raw's publication count.
pub fn publication_count(&self) -> usize {
self.publication_counts.as_ref().map(|p| p.publication_count()).unwrap_or(0)
17 changes: 17 additions & 0 deletions workflows/export/text-search/sequences.nf
@@ -65,6 +65,7 @@ process build_metadata {
path(text)
path(litsumm)
path(editing_events)
path(go_flow_annotations)
Review comment (Contributor, severity: critical):

The go_flow_annotations input is correctly added to this process, but it is not being passed to the search-export sequences merge command within the script block. This omission will cause the pipeline to fail, as the file_joiner in the Rust application expects this file. Please add $go_flow_annotations to the arguments of the search-export command.
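
A minimal sketch of that fix (illustrative only — the existing argument list is collapsed in this diff, and the new path must occupy the position matching the go_flow_llm_annotations field of the Rust SequenceCommand struct, i.e. after the editing events file and before the output name):

    """
    # Hypothetical sketch: '...' stands for the existing, unchanged positional inputs,
    # and metadata.json for whatever output name the process already uses.
    search-export sequences merge \
        ... \
        $editing_events \
        $go_flow_annotations \
        metadata.json
    """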

path(so_tree)

output:
@@ -141,6 +142,20 @@ process litsumm_summaries {
"""
}

process go_flow_annotations {
input:
val(max_count)
path (query)

output:
path("goflow_annotations.json")

"""
psql -v ON_ERROR_STOP=1 -f "$query" "$PGDATABASE" > raw.json
search-export group go-flow-annotation raw.json ${max_count} goflow_annotations.json
"""
}

process editing_events {
input:
val(max_count)
@@ -201,6 +216,7 @@ workflow sequences {
Channel.fromPath('files/search-export/parts/text-mining.sql') | set { text_sql }
Channel.fromPath('files/search-export/parts/litsumm.sql') | set { litsumm_sql }
Channel.fromPath('files/search-export/parts/editing-events.sql') | set { editing_events_sql }
Channel.fromPath('files/search-export/parts/goflow.sql') | set { goflow_sql }
Channel.fromPath('files/search-export/so-rna-types.sql') | set { so_sql }

Channel.fromPath('files/search-export/parts/accessions.sql') | set { accessions_sql }
@@ -230,6 +246,7 @@
text_mining_query(search_count, text_sql),
litsumm_summaries(search_count, litsumm_sql),
editing_events(search_count, editing_events_sql),
go_flow_annotations(search_count, goflow_sql),
fetch_so_tree(so_sql),
)\
| set { metadata }