Skip to content
Draft
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
aaf6d9a
Rust side implementation for GoFlow search index
afg1 Oct 13, 2025
53f2fef
Python bits of the goflowllm search index export
afg1 Oct 13, 2025
ffa4e42
Add necessary sql and nextflow bits
afg1 Oct 13, 2025
7c0b270
Merge branch 'dev' into gfllm-search-export
afg1 Oct 13, 2025
ca3ee09
Update rnacentral_pipeline/cli/r2dt.py
afg1 Oct 14, 2025
bbb78e2
Update rnacentral_pipeline/databases/ensembl/genomes/urls.py
afg1 Oct 14, 2025
c40cef0
Remove unused fetching of ensembl latest release from parsing release…
afg1 Oct 14, 2025
3e818d6
Remove some debugging print statements
afg1 Oct 14, 2025
a4901b7
Update rnacentral_pipeline/databases/ensembl/genomes/urls.py
afg1 Oct 18, 2025
0efdf8d
Remove some dead code relating to finding the ensembl release number
afg1 Oct 18, 2025
f30bead
Reinstate conditionals for running r2dt
afg1 Oct 18, 2025
0c2d944
Fix other instances of release matching not being defensive about no …
afg1 Oct 18, 2025
96d37fd
Update rnacentral_pipeline/rnacentral/r2dt/parser.py
afg1 Oct 18, 2025
29dc710
Link up go flow search export processes properly
afg1 Oct 25, 2025
6332d64
Use less stringent regex in ensembl release detection
afg1 Oct 27, 2025
cd6e07d
Remove some trailing whitespace
afg1 Oct 27, 2025
fa07512
Raise value errors instead of relying on assertions
afg1 Oct 27, 2025
b045965
Merge branch 'dev' into gfllm-search-export
afg1 Oct 27, 2025
f00a676
Fix not passing goflow data to merging process correctly
afg1 Oct 27, 2025
acdba3c
Update expected key from search export rust code
afg1 Oct 27, 2025
fa64872
Rust side CLI requires kebab-case filename to match file type enum, s…
afg1 Oct 27, 2025
77583ac
Improve documentation comment in sequence raw handler
afg1 Oct 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bin/litscan-get-articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def create_xml_file(results, directory):
ET.SubElement(additional_fields, "field", name="score").text = item['score']
ET.SubElement(additional_fields, "field", name="cited_by").text = item['cited_by']
ET.SubElement(additional_fields, "field", name="type").text = item['type']
ET.SubElement(additional_fields, "field", name="rna_related").text = item['rna_related']
ET.SubElement(additional_fields, "field", name="job_id").text = elem["display_id"]
ET.SubElement(additional_fields, "field", name="title_value").text = elem['id_in_title']
ET.SubElement(additional_fields, "field", name="abstract_value").text = elem['id_in_abstract']
Expand Down Expand Up @@ -114,6 +115,7 @@ def main(database, directory):
article['score'] = str(row[8])
article['cited_by'] = str(row[9])
article['type'] = row[11]
article['rna_related'] = row[12]
articles_list.append(article)

for article in articles_list:
Expand Down
13 changes: 13 additions & 0 deletions files/search-export/parts/goflow.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Export, as one JSON object per line, every search-export sequence that has
-- GoFlowLLM curation results, flagged so the search index can surface the
-- GoFlow annotation section for it.
--
-- NOTE(review): a plain join will emit one row per matching curation result;
-- if go_flow_llm_curation_results can hold several rows per urs_taxid this
-- produces duplicates (tolerated downstream by the AnyNumber grouping) --
-- confirm, or switch to a semijoin (WHERE EXISTS) if exactly-one is wanted.
COPY (
    SELECT
        json_build_object(
            'id', search_urs.id,
            'urs_taxid', search_urs.urs_taxid,
            'should_show_goflow', true
        )
    FROM search_export_urs search_urs
    INNER JOIN go_flow_llm_curation_results curated
        ON search_urs.urs_taxid = curated.urs_taxid
    ORDER BY search_urs.id
) TO STDOUT
10 changes: 5 additions & 5 deletions rnacentral_pipeline/cli/r2dt.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,11 +262,11 @@ def r2dt_prepare_s3(model_info, directory, output, file_list, allow_missing):
@click.option("--max_sequences", default=-1)
def r2dt_prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences):
"""
Prepare the sequences extracted from RNAcentral
Prepare a list of URS identifiers to fetch sequences for.

This means we will load and deduplicate the json file before rewriting
a json file containing only the requested number of sequences.

The default will be to write out all sequences
This takes a file of all URS identifiers from cross-references and a file
of already tracked URS identifiers. It produces a file of URS identifiers
that are in the xref file but not in the tracked file. This can be limited
to a maximum number of sequences.
"""
r2dt.prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences)
15 changes: 6 additions & 9 deletions rnacentral_pipeline/databases/ensembl/genomes/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,16 @@

LOGGER = logging.getLogger(__name__)


def list_releases(ftp: FTP) -> ty.List[str]:
return [f for f in ftp.nlst() if f.startswith("release-")]


def latest_release(releases: ty.List[str], ftp: FTP) -> str:
def latest_release(ftp: FTP) -> str:
## Parse the readme for the current release to avoid getting a half baked release
readme_lines = []
ftp.retrlines("RETR current_README", readme_lines.append)
cur_readme = "\n".join(readme_lines)
pattern = r"Ensembl Release (\d+) Databases."
release = re.search(pattern, cur_readme).group(1)
match = re.search(pattern, cur_readme)
if not match:
raise ValueError("Could not find release number in README")
release = match.group(1)
print(f"Ensembl release {release}")
return f"release-{release}"

Expand Down Expand Up @@ -95,8 +93,7 @@ def urls_for(division: Division, host: str) -> ty.Iterable[FtpInfo]:
ftp.login()
print("LOGIN")
ftp.cwd(f"pub/{division.name}/")
releases = list_releases(ftp)
latest = latest_release(releases, ftp)
latest = latest_release(ftp)
with species_info(ftp, division, latest) as info:
url_base = f"ftp://{host}/pub/{division.name}"
yield from generate_paths(ftp, division, url_base, latest, info)
14 changes: 6 additions & 8 deletions rnacentral_pipeline/databases/ensembl/vertebrates/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,17 @@
from rnacentral_pipeline.databases.ensembl.data import Division, FtpInfo


def list_releases(ftp: FTP) -> ty.List[str]:
return [f for f in ftp.nlst() if f.startswith("release-")]


def latest_release(releases: ty.List[str], ftp: FTP) -> str:
def latest_release(ftp: FTP) -> str:
## Parse the readme for the current release to avoid getting a half baked release
readme_lines = []
ftp.retrlines("RETR current_README", readme_lines.append)
cur_readme = "\n".join(readme_lines)
pattern = r"Ensembl Release (\d+) Databases."
release = re.search(pattern, cur_readme).group(1)
print(f"Ensembl release {release}")
match = re.search(pattern, cur_readme)
if not match:
raise ValueError("Could not determine latest Ensembl release from README")
release = match.group(1)
return f"release-{release}"


Expand Down Expand Up @@ -71,7 +70,6 @@ def urls_for(host: str) -> ty.Iterable[FtpInfo]:
with FTP(host) as ftp:
ftp.login()
ftp.cwd("pub")
releases = list_releases(ftp)
latest = latest_release(releases, ftp)
latest = latest_release(ftp)
with species_info(ftp, latest) as info:
yield from generate_paths(f"ftp://{host}/pub", latest, info)
1 change: 1 addition & 0 deletions rnacentral_pipeline/databases/generic/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ def features(record):
f"Skipping sequence feature {key} due to unexpected type {type(feature)}"
)
continue

## Skip sequence features that don't have the required fields
if feature.get("indexes", None) is None:
LOGGER.warning(
Expand Down
9 changes: 4 additions & 5 deletions rnacentral_pipeline/rnacentral/genome_mapping/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,6 @@ def toplevel_file(
toplevel = base.format(type="toplevel")
base_result = f"ftp://{host}{directory}/{{file}}"

print(primary)
print(toplevel)
print(files)

if primary in files:
return base_result.format(file=primary)
elif toplevel in files:
Expand Down Expand Up @@ -168,7 +164,10 @@ def url_for(species: str, assembly_id: str, kind: str, host: str, soft_masked=Fa
conn.retrlines("RETR current_README", readme_lines.append)
cur_readme = "\n".join(readme_lines)
pattern = r"[Cc]urrent release is (?:Ensembl )?Genomes\s*(\d+)"
release = re.search(pattern, cur_readme).group(1)
match = re.search(pattern, cur_readme)
if not match:
raise ValueError("Could not determine latest Ensembl release from README")
release = match.group(1)

for path in host.paths(species, kind):
try:
Expand Down
3 changes: 1 addition & 2 deletions rnacentral_pipeline/rnacentral/r2dt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,6 @@ def write_inspect_data(handle: ty.IO, db_url: str, output: ty.IO):


def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences):
print(urs_to_fetch.name)
raw_xref = (
pl.scan_csv(xref_urs.name, has_header=False, low_memory=True)
.unique()
Expand All @@ -162,7 +161,7 @@ def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences):

raw_tracked = pl.scan_csv(
tracked_urs.name, low_memory=True
).unique() ## May not need to be uniqued?
).unique()

to_fetch = raw_xref.join(raw_tracked, on="urs", how="anti")

Expand Down
2 changes: 1 addition & 1 deletion rnacentral_pipeline/rnacentral/r2dt/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def parse(
old_model_name = model_name
model_name = temp_model_name_lookup.get(model_name, None)
if model_name is None:
raise ValueError("No info for model %s", old_model_name)
raise ValueError(f"No info for model {old_model_name}")

minfo = model_info[model_name]
info = data.R2DTResultInfo(urs, minfo, source, result_base)
Expand Down
4 changes: 4 additions & 0 deletions rnacentral_pipeline/rnacentral/search_export/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,9 @@ def has_publications(counts):
def has_litsumm(litsumm):
return str(bool(litsumm))

def has_go_flow_llm_annotation(go_flow):
return str(bool(go_flow))


def has_editing_event(editing_events):
return str(bool(editing_events))
Expand Down Expand Up @@ -881,6 +884,7 @@ def edit_ref_to_edit(editing_events):
edit_repeat_type,
keys="editing_events",
),
field("has_go_flow_llm_annotation", has_go_flow_llm_annotation, keys="goflow"),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The key goflow used here does not match the field name go_flow_llm_annotations in the Normalized Rust struct that generates the JSON data. This will result in a KeyError when the Python script tries to access the data. To ensure consistency with other fields like litsumm and editing_events, I've suggested a change in the Rust code to rename the field to goflow during serialization. Alternatively, you could change this key to go_flow_llm_annotations.

## Add new fields above this line! Otherwise editing the produced xml is hard.
tree("so_rna_type", so_rna_type_tree, key="so_rna_type_tree"),
],
Expand Down
9 changes: 9 additions & 0 deletions utils/search-export/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ pub enum Groupable {
SoInfo,
LitsummSummaries,
EditingEvents,
GoFlowAnnotation,
}

#[derive(Debug, StructOpt)]
Expand Down Expand Up @@ -140,6 +141,10 @@ enum SequenceCommand {
/// RNA editing events
editing_events: PathBuf,

#[structopt(parse(from_os_str))]
/// GoFlowLLM annotations
go_flow_llm_annotations: PathBuf,

// Add new arguments above this line!
#[structopt(parse(from_os_str))]
/// Filename to write the results to, '-' means stdout
Expand Down Expand Up @@ -255,6 +260,7 @@ fn main() -> Result<()> {
Groupable::EditingEvents => {
sequences::editing_events::group(&path, max_count, &output)?
},
Groupable::GoFlowAnnotation => sequences::go_flow_annotations::group(&path, max_count, &output)?,
},
Subcommand::Sequences {
command,
Expand All @@ -275,6 +281,8 @@ fn main() -> Result<()> {
litsumm_summaries,
editing_events,
so_term_tree,
go_flow_llm_annotations,
// Add new arguments above this line!
output,
} => sequences::writers::write_merge(
vec![
Expand All @@ -293,6 +301,7 @@ fn main() -> Result<()> {
editing_events,
orfs,
so_term_tree,
go_flow_llm_annotations,
],
&output,
)?,
Expand Down
15 changes: 14 additions & 1 deletion utils/search-export/src/sequences/file_joiner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ use super::{
rfam_hit::RfamHit,
so_tree,
so_tree::SoMapping,
go_flow_annotations::GoFlowLLMAnnotation,
};

#[derive(Debug, Error)]
Expand Down Expand Up @@ -98,6 +99,7 @@ pub enum FileTypes {
PublicationCount,
LitsummSummaries,
EditingEvents,
GoFlowLLMAnnotations,
SoTermTree,
}

Expand All @@ -116,6 +118,7 @@ pub struct FileJoiner<'de> {
rfam_hits: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<RfamHit>>,
publication_counts: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<PublicationCount>>,
lit_summ: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<LitsummSummaries>>,
go_flow_llm_annotations: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<GoFlowLLMAnnotation>>,
editing_events: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<EditingEvent>>,
so_info: SoMapping,
}
Expand Down Expand Up @@ -203,6 +206,7 @@ impl FileJoinerBuilder {
let publication_counts = self.iterator_for(FileTypes::PublicationCount)?;
let lit_summ = self.iterator_for(FileTypes::LitsummSummaries)?;
let editing_events = self.iterator_for(FileTypes::EditingEvents)?;
let go_flow_llm_annotations = self.iterator_for(FileTypes::GoFlowLLMAnnotations)?;
let so_info = so_tree::load(self.path_for(FileTypes::SoTermTree)?)?;

Ok(FileJoiner {
Expand All @@ -220,6 +224,7 @@ impl FileJoinerBuilder {
publication_counts,
lit_summ,
editing_events,
go_flow_llm_annotations,
so_info,
})
}
Expand All @@ -244,6 +249,7 @@ impl<'de> Iterator for FileJoiner<'de> {
self.publication_counts.next(),
self.lit_summ.next(),
self.editing_events.next(),
self.go_flow_llm_annotations.next(),
);

match current {
Expand All @@ -262,6 +268,7 @@ impl<'de> Iterator for FileJoiner<'de> {
None,
None,
None,
None,
) => None,
(
Some(Ok(Required {
Expand Down Expand Up @@ -320,6 +327,10 @@ impl<'de> Iterator for FileJoiner<'de> {
id: id14,
data: editing_events,
})),
Some(Ok(Multiple {
id: id15,
data: goflow_llm_annotations,
})),
) => {
if id1 != id2
|| id1 != id3
Expand All @@ -334,9 +345,10 @@ impl<'de> Iterator for FileJoiner<'de> {
|| id1 != id12
|| id1 != id13
|| id1 != id14
|| id1 != id15
{
return Some(Err(Error::OutofSyncData(vec![
id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14,
id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14, id15
])));
}

Expand All @@ -362,6 +374,7 @@ impl<'de> Iterator for FileJoiner<'de> {
.publication_counts(publication_counts)
.litsumm_summaries(lit_summ)
.editing_events(editing_events)
.go_flow_llm_annotations(goflow_llm_annotations)
.so_tree(so_tree)
.build();

Expand Down
34 changes: 34 additions & 0 deletions utils/search-export/src/sequences/go_flow_annotations.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
use serde::{
Deserialize,
Serialize,
};
use std::path::Path;

use anyhow::Result;
use rnc_core::grouper;

/// One GoFlowLLM curation record for the search export.
///
/// Deserialized from the JSON emitted by `files/search-export/parts/goflow.sql`,
/// which produces objects of the form
/// `{"id": ..., "urs_taxid": ..., "should_show_goflow": true}`.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct GoFlowLLMAnnotation {
    /// Search-export sequence id; this is the grouping index (see `HasIndex`).
    pub id: usize,
    /// The URS_taxid this annotation applies to.
    urs_taxid: String,
    /// Always `true` in the SQL export — the presence of a row is the signal.
    should_show_goflow: bool,
}

impl grouper::HasIndex for GoFlowLLMAnnotation {
    /// The search-export sequence id the grouper collates records by.
    fn index(&self) -> usize {
        self.id
    }
}

/// Group GoFlowLLM annotation records by their search-export sequence id.
///
/// Reads `GoFlowLLMAnnotation` JSON records from `path` and writes grouped
/// output to `output`, `max` being the upper bound on the id range. The
/// `AnyNumber` criteria accepts any count of records per id, since most
/// sequences have no GoFlow curation results.
///
/// # Errors
/// Propagates any I/O or parse error from the underlying grouper.
pub fn group(path: &Path, max: usize, output: &Path) -> Result<()> {
    // `path` and `output` are already references; passing `&path` / `&output`
    // created needless double references (clippy::needless_borrow).
    grouper::group::<GoFlowLLMAnnotation>(grouper::Criteria::AnyNumber, path, 1, max, output)
}

impl GoFlowLLMAnnotation {
    /// Borrow the URS_taxid identifier this annotation applies to.
    pub fn urs_taxid(&self) -> &str {
        self.urs_taxid.as_str()
    }

    /// Whether the search index should show the GoFlow section for this
    /// sequence.
    pub fn should_show_goflow(&self) -> bool {
        self.should_show_goflow
    }
}
1 change: 1 addition & 0 deletions utils/search-export/src/sequences/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@ pub mod qa_status;
pub mod r2dt;
pub mod raw;
pub mod rfam_hit;
pub mod go_flow_annotations;
pub mod so_tree;
pub mod writers;
3 changes: 3 additions & 0 deletions utils/search-export/src/sequences/normalized.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use crate::sequences::{
r2dt::R2dt,
raw::Raw,
rfam_hit::RfamHitVec,
go_flow_annotations::GoFlowLLMAnnotation,
so_tree,
};

Expand Down Expand Up @@ -69,6 +70,7 @@ pub struct Normalized {
publication_count: usize,
litsumm: Vec<LitsummSummaries>,
editing_events: Vec<EditingEvent>,
go_flow_llm_annotations: Vec<GoFlowLLMAnnotation>,
so_rna_type_tree: so_tree::SoTree,

#[serde(flatten)]
Expand Down Expand Up @@ -129,6 +131,7 @@ impl Normalized {
rfam_hits: raw.rfam_hits().to_owned().into_iter().collect(),
orfs: raw.orfs().to_vec().into_iter().collect(),
litsumm: raw.litsumm_summaries().to_vec(),
go_flow_llm_annotations: raw.go_flow_llm_annotations().to_vec(),
editing_events: raw.editing_events().to_vec(),
})
}
Expand Down
Loading