Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract sequence information #8

Merged
merged 3 commits into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ dataset = garr.build_genomicarray(
files=bw_files,
output_path=tempdir,
features=features,
# Specify a fasta file to extract sequences
# for each region in features
genome_fasta="path/to/genome.fasta",
# agg function to summarize mutiple values
# from bigwig within an input feature interval.
feature_annotation_options=garr.FeatureAnnotationOptions(
Expand Down
5 changes: 4 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ install_requires =
pyarrow>=1.0
mopsy
genomicranges

pyfaidx

[options.packages.find]
where = src
Expand All @@ -68,6 +68,9 @@ exclude =
# Add here additional requirements for extra features, to install with:
# `pip install GenomicArrays[PDF]` like:
# PDF = ReportLab; RXP
optional =
torch
pytorch-lightning

# Add here test requirements (semicolon/line-separated)
testing =
Expand Down
63 changes: 30 additions & 33 deletions src/genomicarrays/build_genomicarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@

import pandas as pd
from cellarr import buildutils_tiledb_frame as utf
from pyfaidx import Fasta

from . import build_options as bopt
from . import buildutils_tiledb_array as uta
Expand All @@ -58,7 +59,8 @@ def build_genomicarray(
files: list,
output_path: str,
features: Union[str, pd.DataFrame],
genome: Union[str, pd.DataFrame] = "hg38",
genome_fasta: str,
# genome: Union[str, pd.DataFrame] = "hg38",
sample_metadata: Union[pd.DataFrame, str] = None,
sample_metadata_options: bopt.SampleMetadataOptions = bopt.SampleMetadataOptions(),
matrix_options: bopt.MatrixOptions = bopt.MatrixOptions(),
Expand Down Expand Up @@ -88,15 +90,17 @@ def build_genomicarray(
the first row is expected to contain the column names,
"seqnames", "starts" and "ends".

genome:
A string specifying the genome to automatically download the
chromosome sizes from ucsc.
Additionally, the file may contain a column `sequences`,
to specify the sequence string for each region. Otherwise,
provide a link to the fasta file using the ``genome_fasta``
parameter.

Alternatively, may provide a :py:class:`~pandas.DataFrame`
containing columns 'seqnames' and 'lengths'.
genome_fasta:
Path to a fasta file containing the sequence information.

Note: This parameter is currently not used. Ideally this will
be used to truncate the regions.
Sequence information will be updated for each region
in ``features`` if the DataFrame/path does not contain
a column `sequences`.

sample_metadata:
A :py:class:`~pandas.DataFrame` containing the sample
Expand Down Expand Up @@ -169,26 +173,27 @@ def build_genomicarray(
if not required_cols.issubset(input_intervals.columns):
missing = required_cols - set(input_intervals.columns)
raise ValueError(f"Missing required columns: {missing}")

else:
raise TypeError(
"'input_intervals' is not an expected type (either 'str' or 'Dataframe')."
)
raise TypeError("'input_intervals' is not an expected type (either 'str' or 'Dataframe').")

if not feature_annotation_options.skip:
_col_types = utf.infer_column_types(
input_intervals, feature_annotation_options.column_types
)

if "sequence" not in input_intervals.columns:
gen_fa = Fasta(genome_fasta, as_raw=True)
sequences = []
for row in input_intervals.itertuples():
seq = gen_fa.get_seq(row.seqnames, row.starts, row.ends)
sequences.append(seq)

input_intervals["sequences"] = sequences

_col_types = utf.infer_column_types(input_intervals, feature_annotation_options.column_types)

if "genarr_feature_index" not in input_intervals.columns:
input_intervals["genarr_feature_index"] = range(0, len(input_intervals))

_feature_output_uri = (
f"{output_path}/{feature_annotation_options.tiledb_store_name}"
)
utf.create_tiledb_frame_from_dataframe(
_feature_output_uri, input_intervals, column_types=_col_types
)
_feature_output_uri = f"{output_path}/{feature_annotation_options.tiledb_store_name}"
utf.create_tiledb_frame_from_dataframe(_feature_output_uri, input_intervals, column_types=_col_types)

if optimize_tiledb:
uta.optimize_tiledb_array(_feature_output_uri)
Expand Down Expand Up @@ -218,16 +223,10 @@ def build_genomicarray(
raise TypeError("'sample_metadata' is not an expected type.")

if not sample_metadata_options.skip:
_col_types = utf.infer_column_types(
sample_metadata, sample_metadata_options.column_types
)
_col_types = utf.infer_column_types(sample_metadata, sample_metadata_options.column_types)

_sample_output_uri = (
f"{output_path}/{sample_metadata_options.tiledb_store_name}"
)
utf.create_tiledb_frame_from_dataframe(
_sample_output_uri, sample_metadata, column_types=_col_types
)
_sample_output_uri = f"{output_path}/{sample_metadata_options.tiledb_store_name}"
utf.create_tiledb_frame_from_dataframe(_sample_output_uri, sample_metadata, column_types=_col_types)

if optimize_tiledb:
uta.optimize_tiledb_array(_sample_output_uri)
Expand Down Expand Up @@ -289,6 +288,4 @@ def _write_intervals_to_tiledb(outpath, intervals, bwpath, bwidx, agg_func):
def _wrapper_extract_bwinfo(args):
"""Wrapper for multiprocessing multiple files and intervals."""
counts_uri, input_intervals, bwpath, idx, agg_func = args
return _write_intervals_to_tiledb(
counts_uri, input_intervals, bwpath, idx, agg_func
)
return _write_intervals_to_tiledb(counts_uri, input_intervals, bwpath, idx, agg_func)
19 changes: 19 additions & 0 deletions tests/data/notes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
To download the sequence for the tests

1. Download the binary from ucsc tools for your operating system, for a mac with arm processor:

```sh
wget http://hgdownload.cse.ucsc.edu/admin/exe/macOSX.arm64/twoBitToFa
```

2. Update permissions to execute the binary:

```sh
chmod a+x twoBitToFa
```

3. Download the fasta for the region of interest:

```sh
./twoBitToFa http://hgdownload.cse.ucsc.edu/gbdb/hg19/hg19.2bit test.fa -seq=chr1 -start=250 -end=650
```
20 changes: 20 additions & 0 deletions tests/data/test.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
>chr1
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
1 change: 1 addition & 0 deletions tests/data/test.fa.fai
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
chr1 950 6 50 51
1 change: 1 addition & 0 deletions tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def test_ingest_bigwigs():
output_path=tempdir,
files=["tests/data/test1.bw", "tests/data/test2.bw"],
features=features,
genome_fasta="tests/data/test.fa"
)

cfp = tiledb.open(f"{tempdir}/coverage", "r")
Expand Down
3 changes: 2 additions & 1 deletion tests/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def test_query():
output_path=tempdir,
files=["tests/data/test1.bw", "tests/data/test2.bw"],
features=features,
genome_fasta="tests/data/test.fa"
)

assert dataset is not None
Expand Down Expand Up @@ -69,7 +70,7 @@ def test_query():
assert len(cd.get_sample_subset("genarr_sample == 'sample_1'")) == 1

assert sorted(cd.get_feature_annotation_columns()) == sorted(
["seqnames", "starts", "ends", "genarr_feature_index"]
["seqnames", "starts", "ends", "genarr_feature_index", "sequences"]
)
assert len(cd.get_feature_annotation_column("genarr_feature_index")) == 15
assert len(cd.get_feature_subset("genarr_feature_index == 1")) == 1
Expand Down