CellArr · jkanche · Nov 6, 2024 · Nov 6, 2024 · Nov 6, 2024 · Nov 6, 2024
diff --git a/README.md b/README.md
@@ -67,6 +67,9 @@ dataset = garr.build_genomicarray(
      files=bw_files,
      output_path=tempdir,
      features=features,
+     # Specify a fasta file to extract sequences
+     # for each region in features
+     genome_fasta="path/to/genome.fasta",
      # agg function to summarize mutiple values
      # from bigwig within an input feature interval.
      feature_annotation_options=garr.FeatureAnnotationOptions(

diff --git a/setup.cfg b/setup.cfg
@@ -57,7 +57,7 @@ install_requires =
     pyarrow>=1.0
     mopsy
     genomicranges
-
+    pyfaidx
 
 [options.packages.find]
 where = src
@@ -68,6 +68,9 @@ exclude =
 # Add here additional requirements for extra features, to install with:
 # `pip install GenomicArrays[PDF]` like:
 # PDF = ReportLab; RXP
+optional =
+    torch
+    pytorch-lightning
 
 # Add here test requirements (semicolon/line-separated)
 testing =

diff --git a/src/genomicarrays/build_genomicarray.py b/src/genomicarrays/build_genomicarray.py
@@ -42,6 +42,7 @@
 
 import pandas as pd
 from cellarr import buildutils_tiledb_frame as utf
+from pyfaidx import Fasta
 
 from . import build_options as bopt
 from . import buildutils_tiledb_array as uta
@@ -58,7 +59,8 @@ def build_genomicarray(
     files: list,
     output_path: str,
     features: Union[str, pd.DataFrame],
-    genome: Union[str, pd.DataFrame] = "hg38",
+    genome_fasta: str,
+    # genome: Union[str, pd.DataFrame] = "hg38",
     sample_metadata: Union[pd.DataFrame, str] = None,
     sample_metadata_options: bopt.SampleMetadataOptions = bopt.SampleMetadataOptions(),
     matrix_options: bopt.MatrixOptions = bopt.MatrixOptions(),
@@ -88,15 +90,17 @@ def build_genomicarray(
             the first row is expected to contain the column names,
             "seqnames", "starts" and "ends".
 
-        genome:
-            A string specifying the genome to automatically download the
-            chromosome sizes from ucsc.
+            Additionally, the file may contain a column `sequences`,
+            to specify the sequence string for each region. Otherwise,
+            provide a link to the fasta file using the ``genome_fasta``
+            parameter.
 
-            Alternatively, may provide a :py:class:`~pandas.DataFrame`
-            containing columns 'seqnames' and 'lengths'.
+        genome_fasta:
+            Path to a fasta file containing the sequence information.
 
-            Note: This parameter is currently not used. Ideally this will
-            be used to truncate the regions.
+            Sequence information will be updated for each region
+            in ``features`` if the DataFrame/path does not contain
+            a column `sequences`.
 
         sample_metadata:
             A :py:class:`~pandas.DataFrame` containing the sample
@@ -169,26 +173,27 @@ def build_genomicarray(
         if not required_cols.issubset(input_intervals.columns):
             missing = required_cols - set(input_intervals.columns)
             raise ValueError(f"Missing required columns: {missing}")
-
     else:
-        raise TypeError(
-            "'input_intervals' is not an expected type (either 'str' or 'Dataframe')."
-        )
+        raise TypeError("'input_intervals' is not an expected type (either 'str' or 'Dataframe').")
 
     if not feature_annotation_options.skip:
-        _col_types = utf.infer_column_types(
-            input_intervals, feature_annotation_options.column_types
-        )
+
+        if "sequence" not in input_intervals.columns:
+            gen_fa = Fasta(genome_fasta, as_raw=True)
+            sequences = []
+            for row in input_intervals.itertuples():
+                seq = gen_fa.get_seq(row.seqnames, row.starts, row.ends)
+                sequences.append(seq)
+
+            input_intervals["sequences"] = sequences
+
+        _col_types = utf.infer_column_types(input_intervals, feature_annotation_options.column_types)
 
         if "genarr_feature_index" not in input_intervals.columns:
             input_intervals["genarr_feature_index"] = range(0, len(input_intervals))
 
-        _feature_output_uri = (
-            f"{output_path}/{feature_annotation_options.tiledb_store_name}"
-        )
-        utf.create_tiledb_frame_from_dataframe(
-            _feature_output_uri, input_intervals, column_types=_col_types
-        )
+        _feature_output_uri = f"{output_path}/{feature_annotation_options.tiledb_store_name}"
+        utf.create_tiledb_frame_from_dataframe(_feature_output_uri, input_intervals, column_types=_col_types)
 
         if optimize_tiledb:
             uta.optimize_tiledb_array(_feature_output_uri)
@@ -218,16 +223,10 @@ def build_genomicarray(
         raise TypeError("'sample_metadata' is not an expected type.")
 
     if not sample_metadata_options.skip:
-        _col_types = utf.infer_column_types(
-            sample_metadata, sample_metadata_options.column_types
-        )
+        _col_types = utf.infer_column_types(sample_metadata, sample_metadata_options.column_types)
 
-        _sample_output_uri = (
-            f"{output_path}/{sample_metadata_options.tiledb_store_name}"
-        )
-        utf.create_tiledb_frame_from_dataframe(
-            _sample_output_uri, sample_metadata, column_types=_col_types
-        )
+        _sample_output_uri = f"{output_path}/{sample_metadata_options.tiledb_store_name}"
+        utf.create_tiledb_frame_from_dataframe(_sample_output_uri, sample_metadata, column_types=_col_types)
 
         if optimize_tiledb:
             uta.optimize_tiledb_array(_sample_output_uri)
@@ -289,6 +288,4 @@ def _write_intervals_to_tiledb(outpath, intervals, bwpath, bwidx, agg_func):
 def _wrapper_extract_bwinfo(args):
     """Wrapper for multiprocessing multiple files and intervals."""
     counts_uri, input_intervals, bwpath, idx, agg_func = args
-    return _write_intervals_to_tiledb(
-        counts_uri, input_intervals, bwpath, idx, agg_func
-    )
+    return _write_intervals_to_tiledb(counts_uri, input_intervals, bwpath, idx, agg_func)
diff --git a/tests/data/notes.md b/tests/data/notes.md
@@ -0,0 +1,19 @@
+To download the sequence for the tests
+
+1. Download the binary from ucsc tools for your operating system, for a mac with arm processor:
+
+```sh
+wget http://hgdownload.cse.ucsc.edu/admin/exe/macOSX.arm64/twoBitToFa
+```
+
+2. Update permissions to execute the binary:
+
+```sh
+chmod a+x twoBitToFa
+```
+
+3. Download the fasta for the region of interest:
+
+```sh
+./twoBitToFa http://hgdownload.cse.ucsc.edu/gbdb/hg19/hg19.2bit test.fa -seq=chr1 -start=250 -end=650
+```
diff --git a/tests/data/test.fa b/tests/data/test.fa
@@ -0,0 +1,20 @@
+>chr1
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
diff --git a/tests/data/test.fa.fai b/tests/data/test.fa.fai
@@ -0,0 +1 @@
+chr1	950	6	50	51
diff --git a/tests/test_ingest.py b/tests/test_ingest.py
@@ -23,6 +23,7 @@ def test_ingest_bigwigs():
         output_path=tempdir,
         files=["tests/data/test1.bw", "tests/data/test2.bw"],
         features=features,
+        genome_fasta="tests/data/test.fa"
     )
 
     cfp = tiledb.open(f"{tempdir}/coverage", "r")

diff --git a/tests/test_query.py b/tests/test_query.py
@@ -24,6 +24,7 @@ def test_query():
         output_path=tempdir,
         files=["tests/data/test1.bw", "tests/data/test2.bw"],
         features=features,
+        genome_fasta="tests/data/test.fa"
     )
 
     assert dataset is not None
@@ -69,7 +70,7 @@ def test_query():
     assert len(cd.get_sample_subset("genarr_sample == 'sample_1'")) == 1
 
     assert sorted(cd.get_feature_annotation_columns()) == sorted(
-        ["seqnames", "starts", "ends", "genarr_feature_index"]
+        ["seqnames", "starts", "ends", "genarr_feature_index", "sequences"]
     )
     assert len(cd.get_feature_annotation_column("genarr_feature_index")) == 15
     assert len(cd.get_feature_subset("genarr_feature_index == 1")) == 1