22 commits
aaf6d9a
Rust side implementation for GoFlow search index
afg1 Oct 13, 2025
53f2fef
Python bits of the goflowllm search index export
afg1 Oct 13, 2025
ffa4e42
Add necessary sql and nextflow bits
afg1 Oct 13, 2025
7c0b270
Merge branch 'dev' into gfllm-search-export
afg1 Oct 13, 2025
ca3ee09
Update rnacentral_pipeline/cli/r2dt.py
afg1 Oct 14, 2025
bbb78e2
Update rnacentral_pipeline/databases/ensembl/genomes/urls.py
afg1 Oct 14, 2025
c40cef0
Remove unused fetching of ensembl latest release from parsing release…
afg1 Oct 14, 2025
3e818d6
Remove some debugging print statements
afg1 Oct 14, 2025
a4901b7
Update rnacentral_pipeline/databases/ensembl/genomes/urls.py
afg1 Oct 18, 2025
0efdf8d
Remove some dead code relating to finding the ensembl release number
afg1 Oct 18, 2025
f30bead
Reinstate conditionals for running r2dt
afg1 Oct 18, 2025
0c2d944
Fix other instances of release matching not being defensive about no …
afg1 Oct 18, 2025
96d37fd
Update rnacentral_pipeline/rnacentral/r2dt/parser.py
afg1 Oct 18, 2025
29dc710
Link up go flow search export processes properly
afg1 Oct 25, 2025
6332d64
Use less stringent regex in ensembl release detection
afg1 Oct 27, 2025
cd6e07d
Remove some trailing whitespace
afg1 Oct 27, 2025
fa07512
Raise value errors instead of relying on assertions
afg1 Oct 27, 2025
b045965
Merge branch 'dev' into gfllm-search-export
afg1 Oct 27, 2025
f00a676
Fix not passing goflow data to merging process correctly
afg1 Oct 27, 2025
acdba3c
Update expected key from search export rust code
afg1 Oct 27, 2025
fa64872
Rust side CLI requires kebab-case filename to match file type enum, s…
afg1 Oct 27, 2025
77583ac
Improve documentation comment in sequence raw handler
afg1 Oct 27, 2025
13 changes: 13 additions & 0 deletions files/search-export/parts/goflow.sql
@@ -0,0 +1,13 @@
COPY (
SELECT
json_build_object(
'id', todo.id,
'urs_taxid', todo.urs_taxid,
'should_show_goflow', true
)
FROM search_export_urs todo
JOIN go_flow_llm_curation_results gfllm
ON
todo.urs_taxid = gfllm.urs_taxid
ORDER by todo.id
) TO STDOUT
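Each row this COPY streams to STDOUT is one JSON object per line, and its keys have to line up with the GoFlowLLMAnnotation struct the Rust exporter deserializes. An illustrative (made-up) row:

    {"id": 42, "urs_taxid": "URS0000000001_9606", "should_show_goflow": true}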
5 changes: 3 additions & 2 deletions rnacentral_pipeline/databases/ensembl/genomes/urls.py
@@ -30,8 +30,9 @@ def latest_release(ftp: FTP) -> str:
readme_lines = []
ftp.retrlines("RETR current_README", readme_lines.append)
cur_readme = "\n".join(readme_lines)
pattern = r"Ensembl Release (\d+) Databases."
match = re.search(pattern, cur_readme)
pattern = r"Ensembl Release (\d+) Databases\."
match = re.search(pattern, cur_readme, re.IGNORECASE)

if not match:
raise ValueError("Could not find release number in README")
release = match.group(1)
5 changes: 3 additions & 2 deletions rnacentral_pipeline/databases/ensembl/vertebrates/urls.py
@@ -29,8 +29,9 @@ def latest_release(ftp: FTP) -> str:
readme_lines = []
ftp.retrlines("RETR current_README", readme_lines.append)
cur_readme = "\n".join(readme_lines)
pattern = r"Ensembl Release (\d+) Databases."
match = re.search(pattern, cur_readme)
pattern = r"Ensembl Release (\d+) Databases\."
match = re.search(pattern, cur_readme, re.IGNORECASE)

if not match:
raise ValueError("Could not determine latest Ensembl release from README")
release = match.group(1)
4 changes: 1 addition & 3 deletions rnacentral_pipeline/rnacentral/r2dt/__init__.py
@@ -159,9 +159,7 @@ def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences):
.rename({"column_1": "urs"})
)

raw_tracked = pl.scan_csv(
tracked_urs.name, low_memory=True
).unique()
raw_tracked = pl.scan_csv(tracked_urs.name, low_memory=True).unique()

to_fetch = raw_xref.join(raw_tracked, on="urs", how="anti")

8 changes: 6 additions & 2 deletions rnacentral_pipeline/rnacentral/r2dt/data.py
@@ -409,11 +409,15 @@ def dot_bracket(self):
seq_dot = str(record.seq)
## Use indices instead, assert that the string is even length
## If not, then the two parts are not the same length
assert len(seq_dot) % 2 == 0, f"Odd length sequence {len(seq_dot)}"
if len(seq_dot) % 2 != 0:
raise ValueError(f"Odd length sequence {len(seq_dot)}")
seq_dot_len = len(seq_dot)
sequence = seq_dot[0 : seq_dot_len // 2]
dot_bracket = seq_dot[(seq_dot_len // 2) :]
assert len(sequence) == len(dot_bracket)
if len(sequence) != len(dot_bracket):
raise ValueError(
f"Sequence and dot bracket lengths do not match: {len(sequence)} != {len(dot_bracket)}"
)
return dot_bracket

def basepair_count(self):
4 changes: 4 additions & 0 deletions rnacentral_pipeline/rnacentral/search_export/data.py
@@ -708,6 +708,9 @@ def has_publications(counts):
def has_litsumm(litsumm):
return str(bool(litsumm))

def has_go_flow_llm_annotation(go_flow):
return str(bool(go_flow))


def has_editing_event(editing_events):
return str(bool(editing_events))
@@ -881,6 +884,7 @@ def edit_ref_to_edit(editing_events):
edit_repeat_type,
keys="editing_events",
),
field("has_go_flow_llm_annotation", has_go_flow_llm_annotation, keys="goflow"),
Review comment (Contributor, severity: high):

The key goflow used here does not match the field name go_flow_llm_annotations in the Normalized Rust struct that generates the JSON data. This will result in a KeyError when the Python script tries to access the data. To ensure consistency with other fields like litsumm and editing_events, I've suggested a change in the Rust code to rename the field to goflow during serialization. Alternatively, you could change this key to go_flow_llm_annotations.
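
For example, a minimal sketch (illustrative only, not part of this diff) of the serde-side rename that would make the emitted JSON key match keys="goflow":

    use serde::Serialize;

    // Hypothetical, self-contained illustration of the suggested rename; in the real
    // Normalized struct the field type is Vec<GoFlowLLMAnnotation>, stubbed here as
    // Vec<String> so the snippet compiles on its own (requires serde and serde_json).
    #[derive(Serialize)]
    struct Normalized {
        #[serde(rename = "goflow")]
        go_flow_llm_annotations: Vec<String>,
    }

    fn main() {
        let n = Normalized { go_flow_llm_annotations: vec![] };
        println!("{}", serde_json::to_string(&n).unwrap()); // prints {"goflow":[]}
    }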

## Add new fields above this line! Otherwise editing the produced xml is hard.
tree("so_rna_type", so_rna_type_tree, key="so_rna_type_tree"),
],
9 changes: 9 additions & 0 deletions utils/search-export/src/main.rs
@@ -33,6 +33,7 @@ pub enum Groupable {
SoInfo,
LitsummSummaries,
EditingEvents,
GoFlowAnnotation,
}

#[derive(Debug, StructOpt)]
@@ -140,6 +141,10 @@ enum SequenceCommand {
/// RNA editing events
editing_events: PathBuf,

#[structopt(parse(from_os_str))]
/// GoFlowLLM annotations
go_flow_llm_annotations: PathBuf,

// Add new arguments above this line!
#[structopt(parse(from_os_str))]
/// Filename to write the results to, '-' means stdout
@@ -255,6 +260,7 @@ fn main() -> Result<()> {
Groupable::EditingEvents => {
sequences::editing_events::group(&path, max_count, &output)?
},
Groupable::GoFlowAnnotation => sequences::go_flow_annotations::group(&path, max_count, &output)?,
},
Subcommand::Sequences {
command,
@@ -275,6 +281,8 @@ fn main() -> Result<()> {
litsumm_summaries,
editing_events,
so_term_tree,
go_flow_llm_annotations,
// Add new arguments above this line!
output,
} => sequences::writers::write_merge(
vec![
@@ -293,6 +301,7 @@
editing_events,
orfs,
so_term_tree,
go_flow_llm_annotations,
],
&output,
)?,
15 changes: 14 additions & 1 deletion utils/search-export/src/sequences/file_joiner.rs
@@ -49,6 +49,7 @@ use super::{
rfam_hit::RfamHit,
so_tree,
so_tree::SoMapping,
go_flow_annotations::GoFlowLLMAnnotation,
};

#[derive(Debug, Error)]
@@ -98,6 +99,7 @@ pub enum FileTypes {
PublicationCount,
LitsummSummaries,
EditingEvents,
GoFlowLLMAnnotations,
SoTermTree,
}

@@ -116,6 +118,7 @@ pub struct FileJoiner<'de> {
rfam_hits: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<RfamHit>>,
publication_counts: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<PublicationCount>>,
lit_summ: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<LitsummSummaries>>,
go_flow_llm_annotations: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<GoFlowLLMAnnotation>>,
editing_events: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<EditingEvent>>,
so_info: SoMapping,
}
@@ -203,6 +206,7 @@ impl FileJoinerBuilder {
let publication_counts = self.iterator_for(FileTypes::PublicationCount)?;
let lit_summ = self.iterator_for(FileTypes::LitsummSummaries)?;
let editing_events = self.iterator_for(FileTypes::EditingEvents)?;
let go_flow_llm_annotations = self.iterator_for(FileTypes::GoFlowLLMAnnotations)?;
let so_info = so_tree::load(self.path_for(FileTypes::SoTermTree)?)?;

Ok(FileJoiner {
@@ -220,6 +224,7 @@
publication_counts,
lit_summ,
editing_events,
go_flow_llm_annotations,
so_info,
})
}
@@ -244,6 +249,7 @@ impl<'de> Iterator for FileJoiner<'de> {
self.publication_counts.next(),
self.lit_summ.next(),
self.editing_events.next(),
self.go_flow_llm_annotations.next(),
);

match current {
@@ -262,6 +268,7 @@
None,
None,
None,
None,
) => None,
(
Some(Ok(Required {
@@ -320,6 +327,10 @@
id: id14,
data: editing_events,
})),
Some(Ok(Multiple {
id: id15,
data: goflow_llm_annotations,
})),
) => {
if id1 != id2
|| id1 != id3
@@ -334,9 +345,10 @@
|| id1 != id12
|| id1 != id13
|| id1 != id14
|| id1 != id15
{
return Some(Err(Error::OutofSyncData(vec![
id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14,
id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14, id15
])));
}

@@ -362,6 +374,7 @@
.publication_counts(publication_counts)
.litsumm_summaries(lit_summ)
.editing_events(editing_events)
.go_flow_llm_annotations(goflow_llm_annotations)
.so_tree(so_tree)
.build();

34 changes: 34 additions & 0 deletions utils/search-export/src/sequences/go_flow_annotations.rs
@@ -0,0 +1,34 @@
use serde::{
Deserialize,
Serialize,
};
use std::path::Path;

use anyhow::Result;
use rnc_core::grouper;

#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct GoFlowLLMAnnotation {
pub id: usize,
urs_taxid: String,
should_show_goflow: bool,
}

impl grouper::HasIndex for GoFlowLLMAnnotation {
fn index(&self) -> usize {
self.id
}
}

pub fn group(path: &Path, max: usize, output: &Path) -> Result<()> {
grouper::group::<GoFlowLLMAnnotation>(grouper::Criteria::AnyNumber, &path, 1, max, &output)
}

impl GoFlowLLMAnnotation {
pub fn should_show_goflow(&self) -> bool {
self.should_show_goflow
}
pub fn urs_taxid(&self) -> &str {
&self.urs_taxid
}
}
1 change: 1 addition & 0 deletions utils/search-export/src/sequences/mod.rs
@@ -16,5 +16,6 @@ pub mod qa_status;
pub mod r2dt;
pub mod raw;
pub mod rfam_hit;
pub mod go_flow_annotations;
pub mod so_tree;
pub mod writers;
3 changes: 3 additions & 0 deletions utils/search-export/src/sequences/normalized.rs
@@ -37,6 +37,7 @@ use crate::sequences::{
r2dt::R2dt,
raw::Raw,
rfam_hit::RfamHitVec,
go_flow_annotations::GoFlowLLMAnnotation,
so_tree,
};

@@ -69,6 +70,7 @@ pub struct Normalized {
publication_count: usize,
litsumm: Vec<LitsummSummaries>,
editing_events: Vec<EditingEvent>,
go_flow_llm_annotations: Vec<GoFlowLLMAnnotation>,
so_rna_type_tree: so_tree::SoTree,

#[serde(flatten)]
@@ -129,6 +131,7 @@ impl Normalized {
rfam_hits: raw.rfam_hits().to_owned().into_iter().collect(),
orfs: raw.orfs().to_vec().into_iter().collect(),
litsumm: raw.litsumm_summaries().to_vec(),
go_flow_llm_annotations: raw.go_flow_llm_annotations().to_vec(),
editing_events: raw.editing_events().to_vec(),
})
}
7 changes: 7 additions & 0 deletions utils/search-export/src/sequences/raw.rs
@@ -24,6 +24,7 @@ use crate::sequences::{
qa_status::QaStatus,
r2dt::R2dt,
rfam_hit::RfamHit,
go_flow_annotations::GoFlowLLMAnnotation,
so_tree,
};

@@ -46,6 +47,7 @@ pub struct Raw {
publication_counts: Option<PublicationCount>,
litsumm_summaries: Vec<LitsummSummaries>,
editing_events: Vec<EditingEvent>,
go_flow_llm_annotations: Vec<GoFlowLLMAnnotation>,
so_tree: so_tree::SoTree,
}

@@ -148,6 +150,11 @@ impl Raw {
&self.editing_events
}

/// Get a reference to the raw's editing events.
Review comment (Contributor, severity: medium):

This documentation comment appears to be a copy-paste from the editing_events function. It should be updated to describe the go_flow_llm_annotations function.

Suggested change:
- /// Get a reference to the raw's editing events.
+ /// Get a reference to the raw's go flow llm annotations.

pub fn go_flow_llm_annotations(&self) -> &[GoFlowLLMAnnotation] {
&self.go_flow_llm_annotations
}

/// Get this raw's publication count.
pub fn publication_count(&self) -> usize {
self.publication_counts.as_ref().map(|p| p.publication_count()).unwrap_or(0)
17 changes: 17 additions & 0 deletions workflows/export/text-search/sequences.nf
@@ -65,6 +65,7 @@ process build_metadata {
path(text)
path(litsumm)
path(editing_events)
path(go_flow_annotations)
Review comment (Contributor, severity: critical):

The go_flow_annotations input is correctly added to this process, but it is not being passed to the search-export sequences merge command within the script block. This omission will cause the pipeline to fail, as the file_joiner in the Rust application expects this file. Please add $go_flow_annotations to the arguments of the search-export command.
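
A minimal sketch of that fix (illustrative only — the existing argument list is collapsed in this diff, and the new path must occupy the position matching the go_flow_llm_annotations field of the Rust SequenceCommand struct, i.e. after the editing events file and before the output name):

    """
    # Hypothetical sketch: '...' stands for the existing, unchanged positional inputs,
    # and metadata.json for whatever output name the process already uses.
    search-export sequences merge \
        ... \
        $editing_events \
        $go_flow_annotations \
        metadata.json
    """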

path(so_tree)

output:
@@ -141,6 +142,20 @@ process litsumm_summaries {
"""
}

process go_flow_annotations {
input:
val(max_count)
path (query)

output:
path("goflow_annotations.json")

"""
psql -v ON_ERROR_STOP=1 -f "$query" "$PGDATABASE" > raw.json
search-export group go-flow-annotation raw.json ${max_count} goflow_annotations.json
"""
}

process editing_events {
input:
val(max_count)
@@ -201,6 +216,7 @@ workflow sequences {
Channel.fromPath('files/search-export/parts/text-mining.sql') | set { text_sql }
Channel.fromPath('files/search-export/parts/litsumm.sql') | set { litsumm_sql }
Channel.fromPath('files/search-export/parts/editing-events.sql') | set { editing_events_sql }
Channel.fromPath('files/search-export/parts/goflow.sql') | set { goflow_sql }
Channel.fromPath('files/search-export/so-rna-types.sql') | set { so_sql }

Channel.fromPath('files/search-export/parts/accessions.sql') | set { accessions_sql }
@@ -230,6 +246,7 @@
text_mining_query(search_count, text_sql),
litsumm_summaries(search_count, litsumm_sql),
editing_events(search_count, editing_events_sql),
go_flow_annotations(search_count, goflow_sql),
fetch_so_tree(so_sql),
)\
| set { metadata }