From 0904d98e8aa3e5304e85a3a1b2eaf16ae1f505ff Mon Sep 17 00:00:00 2001 From: camiel-m Date: Wed, 26 Jul 2023 15:19:09 +0200 Subject: [PATCH] make SangerDB optional --- chromograph.egg-info/PKG-INFO | 4 +-- chromograph.egg-info/requires.txt | 2 +- chromograph/preprocessing/Chromgen.py | 46 +++++++-------------------- 3 files changed, 14 insertions(+), 38 deletions(-) diff --git a/chromograph.egg-info/PKG-INFO b/chromograph.egg-info/PKG-INFO index d31ad4e..9e1616e 100644 --- a/chromograph.egg-info/PKG-INFO +++ b/chromograph.egg-info/PKG-INFO @@ -1,4 +1,4 @@ -Metadata-Version: 1.0 +Metadata-Version: 2.1 Name: chromograph Version: 0.0.1 Summary: Pipeline for single-cell ATAC-seq analysis @@ -6,5 +6,3 @@ Home-page: https://github.com/linnarsson-lab/chromograph Author: Linnarsson Lab Author-email: camiel.mannens@ki.se License: MIT -Description: UNKNOWN -Platform: UNKNOWN diff --git a/chromograph.egg-info/requires.txt b/chromograph.egg-info/requires.txt index 117c494..6766c06 100644 --- a/chromograph.egg-info/requires.txt +++ b/chromograph.egg-info/requires.txt @@ -1,7 +1,7 @@ loompy numpy scikit-learn -scipy==1.4.1 +scipy matplotlib networkx python-louvain diff --git a/chromograph/preprocessing/Chromgen.py b/chromograph/preprocessing/Chromgen.py index f51b70e..7e27a33 100644 --- a/chromograph/preprocessing/Chromgen.py +++ b/chromograph/preprocessing/Chromgen.py @@ -2,7 +2,7 @@ import numpy as np import os -import sys +import yaml import pybedtools from pybedtools import BedTool import collections @@ -43,37 +43,8 @@ def __init__(self) -> None: self.RNA_file = '' pybedtools.helpers.set_bedtools_path(self.config.paths.bedtools) logging.info("Chromgen initialised") - - # def fragments_to_count(self, ff, outdir, meta, bsize, chromosomes): - # ''' - # ''' - # ## Read Fragments and generate size bins - # logging.info("Read fragments into dict") - # frag_dict = read_fragments(ff) - - # ## Split fragments to seperate files for fast indexing - # logging.info(f"Saving fragments to 
separate folder for fast indexing") - # fdir = os.path.join(outdir, 'fragments') - # if not os.path.isdir(fdir): - # os.mkdir(fdir) - # if len(os.listdir(fdir)) < len(meta['barcode']): - # i = 0 - # for x in meta['barcode']: - # f = os.path.join(fdir, f'{x}.tsv.gz') - # if not os.path.exists(f): - # frags = BedTool(frag_dict[x]).filter(lambda x: x[0] in chromosomes.keys()).saveas(f) - # i += 1 - # if i%1000 == 0: - # logging.info(f'Finished separating fragments for {i} cells') - # ## Count fragments inside bins - # logging.info("Count fragments overlapping with bins") - # Count_dict = count_bins(frag_dict, meta['barcode'], bsize) - # logging.info("Finished counting fragments") - # return Count_dict - - def fit(self, indir: str, bsize: int = 5000, outdir: str = None, genome_size: str = None, blacklist: str = None, min_fragments: bool = False) -> None: + + def fit(self, indir: str, bsize: int = 5000, outdir: str = None, genome_size: str = None, blacklist: str = None, min_fragments: bool = False, path_meta: str = None) -> None: '''' Create a .loom file from 10X Genomics cellranger output with reads binned Args: @@ -82,6 +53,8 @@ def fit(self, indir: str, bsize: int = 5000, outdir: str = None, genome_size: st outdir (str): output folder wher the new loom file should be saved (default to indir) genome_size (str): path to file containing chromosome sizes, usually derived from encode (i.e. 'hg19.chrom.sizes.tsv') blacklist (str): path to bedfile containing blacklisted region (i.e. 'blacklist_hg19.bed') + path_meta (str): path to a metadata file (.yaml format) to use instead of the default database (None to use SangerDB); + at minimum the file must define the sample name Returns: path (str): Full path to the created loom file. 
Remarks: @@ -169,8 +142,13 @@ def fit(self, indir: str, bsize: int = 5000, outdir: str = None, genome_size: st meta['CellID'] = np.array([x.split('-')[0] for x in meta['CellID']]) ## Retrieve sample metadata from SangerDB - logging.info(f'Retrieve metadata from {[self.config.paths.metadata, sample]}') - m = load_sample_metadata(self.config.paths.metadata, sample) + if not path_meta: + logging.info(f'Retrieve metadata from {[self.config.paths.metadata, sample]}') + m = load_sample_metadata(self.config.paths.metadata, sample) + else: + logging.info(f'Retrieve metadata from {path_meta}') + with open(path_meta, 'r') as file: + m = yaml.load(file, Loader=yaml.FullLoader) for k,v in m.items(): meta[k] = np.array([v] * len(meta['barcode']))