diff --git a/.github/workflows/run-codecov.yml b/.github/workflows/run-codecov.yml index a41a1fd..1db19ff 100644 --- a/.github/workflows/run-codecov.yml +++ b/.github/workflows/run-codecov.yml @@ -2,7 +2,7 @@ name: Run codecov on: pull_request: - branches: [master] + branches: [master, dev] jobs: pytest: diff --git a/MANIFEST.in b/MANIFEST.in index e704c91..4f3018a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,3 +3,4 @@ include README.md include docs/img/geofetch_logo.svg include geofetch/config_template.yaml include geofetch/config_processed_template.yaml +include geofetch/looper_sra_convert.yaml diff --git a/README.md b/README.md index 0b84e33..370cd38 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,22 @@ # geofetch logo -[![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io) +[![PEP compatible](https://pepkit.github.io/img/PEP-compatible-green.svg)](https://pepkit.github.io) ![Run pytests](https://github.com/pepkit/geofetch/workflows/Run%20pytests/badge.svg) -[![docs-badge](https://readthedocs.org/projects/geofetch/badge/?version=latest)](http://geofetch.databio.org/en/latest/) +[![docs-badge](https://readthedocs.org/projects/geofetch/badge/?version=latest)](https://geofetch.databio.org/en/latest/) [![pypi-badge](https://img.shields.io/pypi/v/geofetch)](https://pypi.org/project/geofetch) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -`geofetch` is a command-line tool that downloads sequencing data and metadata from GEO and SRA and creates [standard PEPs](http://pep.databio.org/). `geofetch` is hosted at [pypi](https://pypi.org/project/geofetch/) and documentation is hosted at [geofetch.databio.org](http://geofetch.databio.org) (source in the [/docs](/docs) folder). +`geofetch` is a command-line tool that downloads sequencing data and metadata from GEO and SRA and creates [standard PEPs](https://pep.databio.org/). 
`geofetch` is hosted at [pypi](https://pypi.org/project/geofetch/). You can convert the result of geofetch into unmapped `bam` or `fastq` files with the included `sraconvert` command. -You can convert the result of geofetch into unmapped `bam` or `fastq` files with the included `sraconvert` command. +Key geofetch features: + +- Works with GEO and SRA metadata +- Combines samples from different projects +- Standardizes output metadata +- Filters type and size of processed files (from GEO) before downloading them +- Easy to use +- Fast execution time +- Can search GEO to find relevant data +- Can be used either as a command-line tool or from within Python using an API + +For more information, see [complete documentation at geofetch.databio.org](http://geofetch.databio.org) (source in the [/docs](/docs) folder). diff --git a/docs/README.md b/docs/README.md index 1cc4626..7b87b32 100644 --- a/docs/README.md +++ b/docs/README.md @@ -12,6 +12,19 @@ - Produce a standardized [PEP](http://pepkit.github.io) sample table. This makes it really easy to run [looper](https://pepkit.github.io/docs/looper/)-compatible pipelines on public datasets by handling data acquisition and metadata formatting and standardization for you. - Prepare a project to run with [sraconvert](sra_convert.md) to convert SRA files into FASTQ files. +![](./img/pipeline.svg) + +Key geofetch advantages: + +- Works with GEO and SRA metadata +- Combines samples from different projects +- Standardizes output metadata +- Filters type and size of processed files (from GEO) before downloading them +- Easy to use +- Fast execution time +- Can search GEO to find relevant data +- Can be used either as a command-line tool or from within Python using an API + ## Quick example `geofetch` runs on the command line. This command will download the raw data and metadata for the given GSE number. 
@@ -38,5 +51,37 @@ geofetch -i GSE95654 --just-metadata geofetch -i GSE95654 --processed --just-metadata ``` +### Check which arguments you need to download the data you want: + +![](./img/arguments_outputs.svg) + +--- +### New features available in geofetch 0.11.0: +1) geofetch is now available as a Python API package. Geofetch can initialize [peppy](http://peppy.databio.org/) projects without downloading any soft files. Example: + +```python +from geofetch import Geofetcher + +# initiate Geofetcher with all necessary arguments: +geof = Geofetcher(processed=True, acc_anno=True, discard_soft=True) + +# get projects by providing as input GSE or file with GSEs +geof.get_projects("GSE160204") +``` + +2) To find GSEs and save them to a file, you can now use `Finder` - the GSE finder tool: + +```python +from geofetch import Finder + +# initiate Finder (use filters if necessary) +find_gse = Finder(filters='bed') + +# get all projects that were found: +gse_list = find_gse.get_gse_all() +``` +Find more information here: [GSE Finder](./gse_finder.md) + + For more details, check out the [usage](usage.md) reference, [installation instructions](install.md), or head on over to the [tutorial for raw data](raw-data-downloading.md) and [tutorial for processed data](processed-data-downloading.md) for a detailed walkthrough. 
diff --git a/docs/changelog.md b/docs/changelog.md index 2adcd81..7f2ef68 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,18 @@ # Changelog +## [0.11.0] -- 2022-10-26 +- Added initialization of peppy Project without saving any files (from within Python using an API) +- Added Finder (searching GSE tool) +- Added progress bar +- Switched way of saving soft files to request library +- Improved documentation +- Refactored code +- Added `--add-convert-modifier` flag +- Fixed looper amendments in the config file +- Fixed special character bug in the config file +- Fixed None issue in config file +- Fixed saving raw peps bug + ## [0.10.1] -- 2022-08-04 - Updated metadata fetching requests from SRA database diff --git a/docs/gse_finder.md b/docs/gse_finder.md new file mode 100644 index 0000000..14a353f --- /dev/null +++ b/docs/gse_finder.md @@ -0,0 +1,81 @@ +`Finder` is a geofetch class that provides functions to find and retrieve a list of GSEs ([GEO](https://www.ncbi.nlm.nih.gov/geo/) accession numbers) by using the NCBI search tool. + + +### The main features of the geofetch Finder are: +- Find GEO accession numbers (GSE) of projects that were uploaded or updated in a certain period of time. +- Use the same filter query that the [GEO DataSets Advanced Search Builder](https://www.ncbi.nlm.nih.gov/gds/advanced) uses +- Save the list of GSEs to a file (this file can be used later in **[geofetch](http://geofetch.databio.org/en/latest/)**) +- Easier and faster to get GSEs using an NCBI filter and a certain period of time. + + +___ +## Tutorial + +0) Initialize a Finder object. 
+```python +from geofetch import Finder +gse_obj = Finder() + +# Optionally: provide filter string and max number of elements to retrieve +gse_obj = Finder(filters="((bed) OR narrow peak) AND Homo sapiens[Organism]", retmax=10) +``` + +1) Get list of all GSE in GEO +```python + +gse_list = gse_obj.get_gse_all() + +``` + +2) Get list of GSE that were uploaded and updated last week +```python + +gse_list = gse_obj.get_gse_last_week() + +``` + +3) Get list of GSE that were uploaded and updated in the last 3 months +```python + +gse_list = gse_obj.get_gse_last_3_month() + +``` + +4) Get list of GSE that were uploaded and updated in the last *number of days* +```python + +# projects that were uploaded in the last 5 days: +gse_list = gse_obj.get_gse_by_day_count(5) + +``` + +5) Get list of GSE that were uploaded in a certain period of time +```python + +gse_list = gse_obj.get_gse_by_date(start_date="2015/05/05", end_date="2020/05/05") + +``` + +6) Save the last searched list of items to a file +```python + +gse_obj.generate_file("path/to/the/file") + +# if you want to save a different list of GSEs you can provide it to the function +gse_obj.generate_file("path/to/the/file", gse_list=["123", "124"]) + +``` + +7) Compare two lists: +```python + +new_gse_list = gse_obj.find_differences(list1, list2) + +``` + +---- + +More information about gse and queries and id: +- https://www.ncbi.nlm.nih.gov/geo/info/geo_paccess.html +- https://newarkcaptain.com/how-to-retrieve-ncbi-geo-information-using-apis-part1/ +- https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Using_the_Advanced_Search_Pag \ No newline at end of file diff --git a/docs/img/arguments_outputs.svg b/docs/img/arguments_outputs.svg new file mode 100644 index 0000000..89cb3c5 --- /dev/null +++ b/docs/img/arguments_outputs.svg @@ -0,0 +1,8186 @@ + + + +--processed --just-metadata --data-source samples--processed--data-source samples--processed --just-metadata--data-source series--processed--data-source series--processed--data-source 
all--processed --just-metadata --data-source allArguments#12345678--just-metadataOutputData SourceSamplesSamplesSeriesSeriesallallSamplesSamplesMetadataProcessedProcessedProcessedProcessedProcessedProcessedRawRawDataProcessedProcessedProcessedNoneNoneNoneRawNone diff --git a/docs/img/pipeline.svg b/docs/img/pipeline.svg new file mode 100644 index 0000000..ef4d501 --- /dev/null +++ b/docs/img/pipeline.svg @@ -0,0 +1,8072 @@ + + + +geofetchgeofetchPortableEncapsulatedProjectpeppypeprGEO/SRAData diff --git a/docs/install.md b/docs/install.md index 355cd70..01c9c12 100644 --- a/docs/install.md +++ b/docs/install.md @@ -1,21 +1,5 @@ # Installing geofetch -## Prerequisites - -You must have the [sratoolkit from NCBI](https://www.ncbi.nlm.nih.gov/books/NBK158900/) installed, with the tools in your PATH. Once it's installed, you should check to make sure you can run `prefetch`. Also, make sure it's configured to store SRA files where you want them. For more information, see how to change sratools download location. - -## Setting data download location for `sratools` - -`geofetch` is using the [sratoolkit](https://trace.ncbi.nlm.nih.gov/Traces/sra/?view=toolkit_doc&f=std) to download raw data from SRA -- which means it's stuck with the [default path for downloading SRA data](http://databio.org/posts/downloading_sra_data.html), which is in your home directory. So before you run `geofetch`, make sure you have set up your download location to the correct place. In our group, we use a shared group environment variable called `${SRARAW}`, which points to a shared folder (`${DATA}/sra`) where the whole group has access to downloaded SRA data. You can point the `sratoolkit` (and therefore `geofetch`) to use that location with this one-time configuration code: - -``` -echo "/repository/user/main/public/root = \"$DATA\"" > ${HOME}/.ncbi/user-settings.mkfg -``` - -Now `sratoolkit` will download data into an `/sra` folder in `${DATA}`, which is what `${SRARAW}` points to. 
- -If you are getting an error that the `.ncbi` folder does not exist in your home directory, you can just make a folder `.ncbi` with an empty file `user-settings.mkfg` and follow the same command above. - ## Installing geofetch Releases are posted as [GitHub releases](https://github.com/pepkit/geofetch/releases), or you can install from PyPI using `pip`: @@ -35,3 +19,19 @@ If the executable in not in your $PATH, append this to your `.bashrc` or `.profi ``` export PATH=~/.local/bin:$PATH ``` + +## Prerequisites for SRA data downloading + +To download **raw data** You must have the [sratoolkit from NCBI](https://www.ncbi.nlm.nih.gov/books/NBK158900/) installed, with the tools in your PATH. Once it's installed, you should check to make sure you can run `prefetch`. Also, make sure it's configured to store SRA files where you want them. For more information, see how to change sratools download location. + +## Setting data download location for `sratools` + +`geofetch` is using the [sratoolkit](https://trace.ncbi.nlm.nih.gov/Traces/sra/?view=toolkit_doc&f=std) to download raw data from SRA -- which means it's stuck with the [default path for downloading SRA data](http://databio.org/posts/downloading_sra_data.html), which is in your home directory. So before you run `geofetch`, make sure you have set up your download location to the correct place. In our group, we use a shared group environment variable called `${SRARAW}`, which points to a shared folder (`${DATA}/sra`) where the whole group has access to downloaded SRA data. You can point the `sratoolkit` (and therefore `geofetch`) to use that location with this one-time configuration code: + +``` +echo "/repository/user/main/public/root = \"$DATA\"" > ${HOME}/.ncbi/user-settings.mkfg +``` + +Now `sratoolkit` will download data into an `/sra` folder in `${DATA}`, which is what `${SRARAW}` points to. 
+ +If you are getting an error that the `.ncbi` folder does not exist in your home directory, you can just make a folder `.ncbi` with an empty file `user-settings.mkfg` and follow the same command above. \ No newline at end of file diff --git a/docs/metadata_output.md b/docs/metadata_output.md index 3c2eace..5763b2f 100644 --- a/docs/metadata_output.md +++ b/docs/metadata_output.md @@ -1,24 +1,46 @@ # Metadata output -For each GSE input accession (ACC), `geofetch` produces: +Geofetch produces [PEPs](http://pep.databio.org/) for either processed or raw data (including metadata from SRA). +A project can be created either for a single combined (whole) input or for each project separately. +(if `--acc-anno` is set). "combined" means that it will have rows for every sample in every GSE included +in your input. So if you just gave a single GSE, then the combined file is the same as the GSE file. + +**For raw data**: a metadata file will be created including SRA and GSM annotation. + +**For processed data**: a metadata file will be created just for GSE and GSM annotation. User +can choose which data should he download. There are 3 downloading options for processed: samples, series and both. + +### Single PEP will contain: +- project_name.csv - all metadata for sample processed data +- project_name_subannotation.csv (*just for raw data*) - for *merged* samples +(samples for which there are multiple SRR Runs for a single SRX `Experiment`) +- project_name.yaml - project config file that stores all project information + common samples metadata + +Storing common metadata in project file is an efficient way to reduce project size and complexity of csv files. +To specify and manage common metadata (where and how it should be stored) you can use next arguments: +`--const-limit-project`, `--const-limit-discard`, `--attr-limit-truncate` + +### Saving actual data: +Actual data will be saved if `--just-metadata` argument is not set. 
The user should specify the path to the folder where this +data should be downloaded. + +---- +Additionally, for each GSE input accession (ACC), `geofetch` produces (if `--discard-soft` is not set): - GSE_ACC####.soft a SOFT file (annotating the experiment itself) - GSM_ACC####.soft a SOFT file (annotating the samples within the experiment) - SRA_ACC####.soft a CSV file (annotating each SRA Run, retrieved from GSE->GSM->SRA) -In addition, a single combined metadata file (.csv) for the whole input, -including SRA and GSM annotations for each sample. Here, "combined" means that it will have -rows for every sample in every GSE included in your input. So if you just gave a single GSE, -then the combined file is the same as the GSE file. If any "merged" samples exist -(samples for which there are multiple SRR Runs for a single SRX `Experiment`), the -script will also produce a merge table CSV file with the relationships between -SRX and SRR. - -The way this works: Starting from a GSE, select a subset of samples (GSM Accessions) provided, -and then obtain the SRX identifier for each of these from GEO. Now, query SRA for these SRX -accessions and get any associated SRR accessions. Finally, download all of these SRR data files. - -### The most important metadata in pep format will be stored in -- NAME_annotation_sample_processed.csv - all metadata for sample processed data -- NAME_annotation.csv - all metadata for series processed data -- NAME_annotation_series_processed.csv file - all metadata for raw data +____ +# geofetch - Geofetcher using Python + +Users can use geofetch in Python without saving any files. All geofetch projects will be automatically downloaded +as peppy Projects. This helps save time and processing work. + +The output in this case will be a dictionary of projects: +```python +{'key1': (some_project), + 'key2': (second_project)} +``` + +You can find more information in the tutorial files. 
\ No newline at end of file diff --git a/docs/usage.md b/docs/usage.md index 89b3fad..29ba6b1 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,19 +1,24 @@ -# - -Usage reference - -`geofetch` command-line usage instructions: +# Usage reference +geofetch command-line usage instructions: +`geofetch -V` +```console +geofetch 0.11.0 +``` `geofetch --help` -```{console} -usage: geofetch [-h] [-V] -i INPUT [-n NAME] [-m METADATA_ROOT] [-u METADATA_FOLDER] [--just-metadata] [-r] [--config-template CONFIG_TEMPLATE] - [--pipeline_samples PIPELINE_SAMPLES] [--pipeline_project PIPELINE_PROJECT] [-k SKIP] [--acc-anno] [--discard-soft] - [--const-limit-project CONST_LIMIT_PROJECT] [--const-limit-discard CONST_LIMIT_DISCARD] - [--attr-limit-truncate ATTR_LIMIT_TRUNCATE] [-p] [--data-source {all,samples,series}] [--filter FILTER] - [--filter-size FILTER_SIZE] [-g GEO_FOLDER] [-x] [-b BAM_FOLDER] [-f FQ_FOLDER] [--use-key-subset] [--silent] [--verbosity V] - [--logdev] +```console +usage: geofetch [-h] [-V] -i INPUT [-n NAME] [-m METADATA_ROOT] [-u METADATA_FOLDER] + [--just-metadata] [-r] [--config-template CONFIG_TEMPLATE] + [--pipeline-samples PIPELINE_SAMPLES] [--pipeline-project PIPELINE_PROJECT] + [--disable-progressbar] [-k SKIP] [--acc-anno] [--discard-soft] + [--const-limit-project CONST_LIMIT_PROJECT] + [--const-limit-discard CONST_LIMIT_DISCARD] + [--attr-limit-truncate ATTR_LIMIT_TRUNCATE] [--add-dotfile] [-p] + [--data-source {all,samples,series}] [--filter FILTER] + [--filter-size FILTER_SIZE] [-g GEO_FOLDER] [-x] [-b BAM_FOLDER] + [-f FQ_FOLDER] [--use-key-subset] [--silent] [--verbosity V] [--logdev] Automatic GEO and SRA data downloader @@ -21,34 +26,47 @@ optional arguments: -h, --help show this help message and exit -V, --version show program's version number and exit -i INPUT, --input INPUT - required: a GEO (GSE) accession, or a file with a list of GSE numbers + required: a GEO (GSE) accession, or a file with a list of GSE + numbers -n NAME, --name NAME 
Specify a project name. Defaults to GSE number -m METADATA_ROOT, --metadata-root METADATA_ROOT - Specify a parent folder location to store metadata. The project name will be added as a subfolder [Default: $SRAMETA:] + Specify a parent folder location to store metadata. The project name + will be added as a subfolder [Default: $SRAMETA:] -u METADATA_FOLDER, --metadata-folder METADATA_FOLDER - Specify an absolute folder location to store metadata. No subfolder will be added. Overrides value of --metadata-root - [Default: Not used (--metadata-root is used by default)] + Specify an absolute folder location to store metadata. No subfolder + will be added. Overrides value of --metadata-root [Default: Not used + (--metadata-root is used by default)] --just-metadata If set, don't actually run downloads, just create metadata -r, --refresh-metadata If set, re-download metadata even if it exists. --config-template CONFIG_TEMPLATE Project config yaml file template. - --pipeline_samples PIPELINE_SAMPLES - Optional: Specify one or more filepaths to SAMPLES pipeline interface yaml files. These will be added to the project - config file to make it immediately compatible with looper. [Default: null] - --pipeline_project PIPELINE_PROJECT - Optional: Specify one or more filepaths to PROJECT pipeline interface yaml files. These will be added to the project - config file to make it immediately compatible with looper. [Default: null] + --pipeline-samples PIPELINE_SAMPLES + Optional: Specify one or more filepaths to SAMPLES pipeline + interface yaml files. These will be added to the project config file + to make it immediately compatible with looper. [Default: null] + --pipeline-project PIPELINE_PROJECT + Optional: Specify one or more filepaths to PROJECT pipeline + interface yaml files. These will be added to the project config file + to make it immediately compatible with looper. 
[Default: null] + --disable-progressbar + Optional: Disable progressbar -k SKIP, --skip SKIP Skip some accessions. [Default: no skip]. - --acc-anno Optional: Produce annotation sheets for each accession. Project combined PEP for the whole project won't be produced. - --discard-soft Optional: After creation of PEP files, all soft and additional files will be deleted + --acc-anno Optional: Produce annotation sheets for each accession. Project + combined PEP for the whole project won't be produced. + --discard-soft Optional: After creation of PEP files, all soft and additional files + will be deleted --const-limit-project CONST_LIMIT_PROJECT - Optional: Limit of the number of the constant sample characters that should not be in project yaml. [Default: 50] + Optional: Limit of the number of the constant sample characters that + should not be in project yaml. [Default: 50] --const-limit-discard CONST_LIMIT_DISCARD - Optional: Limit of the number of the constant sample characters that should not be discarded [Default: 250] + Optional: Limit of the number of the constant sample characters that + should not be discarded [Default: 250] --attr-limit-truncate ATTR_LIMIT_TRUNCATE - Optional: Limit of the number of sample characters.Any attribute with more than X characters will truncate to the first - X, where X is a number of characters [Default: 500] + Optional: Limit of the number of sample characters.Any attribute + with more than X characters will truncate to the first X, where X is + a number of characters [Default: 500] + --add-dotfile Optional: Add .pep.yaml file that points .yaml PEP file --silent Silence logging. Overrides verbosity. --verbosity V Set logging level (1-5 or logging module level name) --logdev Expand content of logging message format. @@ -56,27 +74,35 @@ optional arguments: processed: -p, --processed Download processed data [Default: download raw data]. 
--data-source {all,samples,series} - Optional: Specifies the source of data on the GEO record to retrieve processed data, which may be attached to the - collective series entity, or to individual samples. Allowable values are: samples, series or both (all). Ignored unless - 'processed' flag is set. [Default: samples] - --filter FILTER Optional: Filter regex for processed filenames [Default: None].Ignored unless 'processed' flag is set. + Optional: Specifies the source of data on the GEO record to retrieve + processed data, which may be attached to the collective series + entity, or to individual samples. Allowable values are: samples, + series or both (all). Ignored unless 'processed' flag is set. + [Default: samples] + --filter FILTER Optional: Filter regex for processed filenames [Default: + None].Ignored unless 'processed' flag is set. --filter-size FILTER_SIZE - Optional: Filter size for processed files that are stored as sample repository [Default: None]. Works only for sample - data. Supported input formats : 12B, 12KB, 12MB, 12GB. Ignored unless 'processed' flag is set. + Optional: Filter size for processed files that are stored as sample + repository [Default: None]. Works only for sample data. Supported + input formats : 12B, 12KB, 12MB, 12GB. Ignored unless 'processed' + flag is set. -g GEO_FOLDER, --geo-folder GEO_FOLDER - Optional: Specify a location to store processed GEO files. Ignored unless 'processed' flag is set.[Default: $GEODATA:] + Optional: Specify a location to store processed GEO files. Ignored + unless 'processed' flag is set.[Default: $GEODATA:] raw: -x, --split-experiments - Split SRR runs into individual samples. By default, SRX experiments with multiple SRR Runs will have a single entry in - the annotation table, with each run as a separate row in the subannotation table. This setting instead treats each run as - a separate sample + Split SRR runs into individual samples. 
By default, SRX experiments + with multiple SRR Runs will have a single entry in the annotation + table, with each run as a separate row in the subannotation table. + This setting instead treats each run as a separate sample -b BAM_FOLDER, --bam-folder BAM_FOLDER - Optional: Specify folder of bam files. Geofetch will not download sra files when corresponding bam files already exist. - [Default: $SRABAM:] + Optional: Specify folder of bam files. Geofetch will not download + sra files when corresponding bam files already exist. [Default: + $SRABAM:] -f FQ_FOLDER, --fq-folder FQ_FOLDER - Optional: Specify folder of fastq files. Geofetch will not download sra files when corresponding fastq files already - exist. [Default: $SRAFQ:] + Optional: Specify folder of fastq files. Geofetch will not download + sra files when corresponding fastq files already exist. [Default: + $SRAFQ:] --use-key-subset Use just the keys defined in this module when writing out metadata. - ``` diff --git a/docs_jupyter/build/processed-data-downloading.md b/docs_jupyter/build/processed-data-downloading.md index 052d413..b851a61 100644 --- a/docs_jupyter/build/processed-data-downloading.md +++ b/docs_jupyter/build/processed-data-downloading.md @@ -15,126 +15,6 @@ geofetch 0.10.1 To see your CLI options, invoke `geofetch -h`: - -```bash -geofetch -h -``` - -```.output -usage: geofetch [-h] [-V] -i INPUT [-n NAME] [-m METADATA_ROOT] - [-u METADATA_FOLDER] [--just-metadata] [-r] - [--config-template CONFIG_TEMPLATE] - [--pipeline-samples PIPELINE_SAMPLES] - [--pipeline-project PIPELINE_PROJECT] [-k SKIP] [--acc-anno] - [--discard-soft] [--const-limit-project CONST_LIMIT_PROJECT] - [--const-limit-discard CONST_LIMIT_DISCARD] - [--attr-limit-truncate ATTR_LIMIT_TRUNCATE] [--add-dotfile] - [-p] [--data-source {all,samples,series}] [--filter FILTER] - [--filter-size FILTER_SIZE] [-g GEO_FOLDER] [-x] - [-b BAM_FOLDER] [-f FQ_FOLDER] [--use-key-subset] [--silent] - [--verbosity V] [--logdev] - -Automatic 
GEO and SRA data downloader - -optional arguments: - -h, --help show this help message and exit - -V, --version show program's version number and exit - -i INPUT, --input INPUT - required: a GEO (GSE) accession, or a file with a list - of GSE numbers - -n NAME, --name NAME Specify a project name. Defaults to GSE number - -m METADATA_ROOT, --metadata-root METADATA_ROOT - Specify a parent folder location to store metadata. - The project name will be added as a subfolder - [Default: $SRAMETA:] - -u METADATA_FOLDER, --metadata-folder METADATA_FOLDER - Specify an absolute folder location to store metadata. - No subfolder will be added. Overrides value of - --metadata-root [Default: Not used (--metadata-root is - used by default)] - --just-metadata If set, don't actually run downloads, just create - metadata - -r, --refresh-metadata - If set, re-download metadata even if it exists. - --config-template CONFIG_TEMPLATE - Project config yaml file template. - --pipeline-samples PIPELINE_SAMPLES - Optional: Specify one or more filepaths to SAMPLES - pipeline interface yaml files. These will be added to - the project config file to make it immediately - compatible with looper. [Default: null] - --pipeline-project PIPELINE_PROJECT - Optional: Specify one or more filepaths to PROJECT - pipeline interface yaml files. These will be added to - the project config file to make it immediately - compatible with looper. [Default: null] - -k SKIP, --skip SKIP Skip some accessions. [Default: no skip]. - --acc-anno Optional: Produce annotation sheets for each - accession. Project combined PEP for the whole project - won't be produced. - --discard-soft Optional: After creation of PEP files, all soft and - additional files will be deleted - --const-limit-project CONST_LIMIT_PROJECT - Optional: Limit of the number of the constant sample - characters that should not be in project yaml. 
- [Default: 50] - --const-limit-discard CONST_LIMIT_DISCARD - Optional: Limit of the number of the constant sample - characters that should not be discarded [Default: 250] - --attr-limit-truncate ATTR_LIMIT_TRUNCATE - Optional: Limit of the number of sample characters.Any - attribute with more than X characters will truncate to - the first X, where X is a number of characters - [Default: 500] - --add-dotfile Optional: Add .pep.yaml file that points .yaml PEP - file - --silent Silence logging. Overrides verbosity. - --verbosity V Set logging level (1-5 or logging module level name) - --logdev Expand content of logging message format. - -processed: - -p, --processed Download processed data [Default: download raw data]. - --data-source {all,samples,series} - Optional: Specifies the source of data on the GEO - record to retrieve processed data, which may be - attached to the collective series entity, or to - individual samples. Allowable values are: samples, - series or both (all). Ignored unless 'processed' flag - is set. [Default: samples] - --filter FILTER Optional: Filter regex for processed filenames - [Default: None].Ignored unless 'processed' flag is - set. - --filter-size FILTER_SIZE - Optional: Filter size for processed files that are - stored as sample repository [Default: None]. Works - only for sample data. Supported input formats : 12B, - 12KB, 12MB, 12GB. Ignored unless 'processed' flag is - set. - -g GEO_FOLDER, --geo-folder GEO_FOLDER - Optional: Specify a location to store processed GEO - files. Ignored unless 'processed' flag is - set.[Default: $GEODATA:] - -raw: - -x, --split-experiments - Split SRR runs into individual samples. By default, - SRX experiments with multiple SRR Runs will have a - single entry in the annotation table, with each run as - a separate row in the subannotation table. This - setting instead treats each run as a separate sample - -b BAM_FOLDER, --bam-folder BAM_FOLDER - Optional: Specify folder of bam files. 
Geofetch will - not download sra files when corresponding bam files - already exist. [Default: $SRABAM:] - -f FQ_FOLDER, --fq-folder FQ_FOLDER - Optional: Specify folder of fastq files. Geofetch will - not download sra files when corresponding fastq files - already exist. [Default: $SRAFQ:] - --use-key-subset Use just the keys defined in this module when writing - out metadata. - -``` - Calling geofetch will do 4 tasks: 1. download all or filtered processed files from `GSE#####` into your geo folder. @@ -144,6 +24,11 @@ Calling geofetch will do 4 tasks: Complete details about geofetch outputs is cataloged in the [metadata outputs reference](metadata_output.md). +from IPython.core.display import SVG +SVG(filename='logo.svg') + +![arguments_outputs.svg](attachment:arguments_outputs.svg) + ## Download the data First, create the metadata for processed data (by adding --processed and --just-metadata): diff --git a/docs_jupyter/build/python-usage.md b/docs_jupyter/build/python-usage.md new file mode 100644 index 0000000..13e98c6 --- /dev/null +++ b/docs_jupyter/build/python-usage.md @@ -0,0 +1,360 @@ +jupyter:True +# Tutorial: using geofetch as a Python package + +♪♫*•♪♪♫*•♪♪♫*•♪♪♫*•♪♪♫* + +Geofetch provides Python functions to fetch data and metadata from GEO and SRA. The `get_projects` function returns a dictionary of peppy projects that were found using the filters and input you specified. + peppy is a Python package that provides an API for handling standardized project and sample metadata. 
+You can find more information here: + +http://peppy.databio.org/en/latest/ + +http://pep.databio.org/en/2.0.0/ + +### First let's import geofetch + + +```python +from geofetch import Geofetcher +``` + +### Initiate a Geofetcher object by specifying the parameters that you want to use for downloading metadata/data + +1) If you don't specify any parameters, default parameters will be used + + +```python +geof = Geofetcher() +``` + +```.output +Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name + +``` + +2) To download processed data with samples and series, specify these two arguments: + + +```python +geof = Geofetcher(processed=True, data_source="all") +``` + +```.output +Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name + +``` + +3) To tune where metadata is stored in the project, use the following parameters: + + +```python +geof = Geofetcher(processed=True, data_source="all", const_limit_project = 20, const_limit_discard = 500, attr_limit_truncate = 10000 ) +``` + +```.output +Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name + +``` + +4) To add more filters or other options, see the documentation + +## Run Geofetch + +### By default: +1) No actual data will be downloaded (just_metadata=True) + +2) No soft files will be saved to disk (discard_soft=True) + + +```python +projects = geof.get_projects("GSE95654") +``` + +```.output +Trying GSE95654 (not a file) as accession... +Trying GSE95654 (not a file) as accession... + +``` + + + Output() + + +```.output +Skipped 0 accessions. Starting now. +Processing accession 1 of 1: 'GSE95654' + +Total number of processed SAMPLES files found is: 40 +Total number of processed SERIES files found is: 0 +Expanding metadata list... +Expanding metadata list... + +``` + + +

+
+
+
+
+
+
+ + + +```.output +Finished processing 1 accession(s) +Cleaning soft files ... +Unifying and saving of metadata... + +``` + + + Output() + + + +

+
+
+
+
+
+
+ + + + +
+
+ + + + + Output() + + + +

+
+
+
+
+
+
+ + + + +
+
+ + + +```.output +No files found. No data to save. File /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name/GSE95654_series/GSE95654_series.csv won't be created + +``` + +Check if projects were created by checking dict keys: + + +```python +projects.keys() +``` + + + + + dict_keys(['GSE95654_samples']) + + + +A project for samples was created! Now let's look into it. + +\* the values of the dictionary are peppy projects. You can find more information about the peppy Project in the documentation: http://peppy.databio.org/en/latest/ + + +```python +len(projects['GSE95654_samples'].samples) +``` + + + + + 40 + + + +We got 40 samples from the GSE95654 project. If you want to check that this is correct, go to: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE95654 + +Now let's see the actual data. First 15 samples and 5 columns: + + +```python +projects['GSE95654_samples'].sample_table.iloc[:15 , :5] +``` + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
sample_namesample_library_strategygenome_buildtissuesample_organism_ch1
sample_name
RRBS_on_CRC_patient_8RRBS_on_CRC_patient_8Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_8RRBS_on_adjacent_normal_colon_patient_8Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_CRC_patient_32RRBS_on_CRC_patient_32Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_32RRBS_on_adjacent_normal_colon_patient_32Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_CRC_patient_41RRBS_on_CRC_patient_41Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_41RRBS_on_adjacent_normal_colon_patient_41Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_CRC_patient_42RRBS_on_CRC_patient_42Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_42RRBS_on_adjacent_normal_colon_patient_42Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_ACF_patient_173RRBS_on_ACF_patient_173Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
RRBS_on_ACF_patient_515RRBS_on_ACF_patient_515Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
RRBS_on_normal_crypts_patient_139RRBS_on_normal_crypts_patient_139Bisulfite-Seqhg19normal colonic cryptHomo sapiens
RRBS_on_ACF_patient_143RRBS_on_ACF_patient_143Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
RRBS_on_normal_crypts_patient_143RRBS_on_normal_crypts_patient_143Bisulfite-Seqhg19normal colonic cryptHomo sapiens
RRBS_on_normal_crypts_patient_165RRBS_on_normal_crypts_patient_165Bisulfite-Seqhg19normal colonic cryptHomo sapiens
RRBS_on_ACF_patient_165RRBS_on_ACF_patient_165Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
+
+ + diff --git a/docs_jupyter/processed-data-downloading.ipynb b/docs_jupyter/processed-data-downloading.ipynb index c691aea..b386a74 100644 --- a/docs_jupyter/processed-data-downloading.ipynb +++ b/docs_jupyter/processed-data-downloading.ipynb @@ -33,135 +33,6 @@ "To see your CLI options, invoke `geofetch -h`:" ] }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: geofetch [-h] [-V] -i INPUT [-n NAME] [-m METADATA_ROOT]\n", - " [-u METADATA_FOLDER] [--just-metadata] [-r]\n", - " [--config-template CONFIG_TEMPLATE]\n", - " [--pipeline-samples PIPELINE_SAMPLES]\n", - " [--pipeline-project PIPELINE_PROJECT] [-k SKIP] [--acc-anno]\n", - " [--discard-soft] [--const-limit-project CONST_LIMIT_PROJECT]\n", - " [--const-limit-discard CONST_LIMIT_DISCARD]\n", - " [--attr-limit-truncate ATTR_LIMIT_TRUNCATE] [--add-dotfile]\n", - " [-p] [--data-source {all,samples,series}] [--filter FILTER]\n", - " [--filter-size FILTER_SIZE] [-g GEO_FOLDER] [-x]\n", - " [-b BAM_FOLDER] [-f FQ_FOLDER] [--use-key-subset] [--silent]\n", - " [--verbosity V] [--logdev]\n", - "\n", - "Automatic GEO and SRA data downloader\n", - "\n", - "optional arguments:\n", - " -h, --help show this help message and exit\n", - " -V, --version show program's version number and exit\n", - " -i INPUT, --input INPUT\n", - " required: a GEO (GSE) accession, or a file with a list\n", - " of GSE numbers\n", - " -n NAME, --name NAME Specify a project name. Defaults to GSE number\n", - " -m METADATA_ROOT, --metadata-root METADATA_ROOT\n", - " Specify a parent folder location to store metadata.\n", - " The project name will be added as a subfolder\n", - " [Default: $SRAMETA:]\n", - " -u METADATA_FOLDER, --metadata-folder METADATA_FOLDER\n", - " Specify an absolute folder location to store metadata.\n", - " No subfolder will be added. 
Overrides value of\n", - " --metadata-root [Default: Not used (--metadata-root is\n", - " used by default)]\n", - " --just-metadata If set, don't actually run downloads, just create\n", - " metadata\n", - " -r, --refresh-metadata\n", - " If set, re-download metadata even if it exists.\n", - " --config-template CONFIG_TEMPLATE\n", - " Project config yaml file template.\n", - " --pipeline-samples PIPELINE_SAMPLES\n", - " Optional: Specify one or more filepaths to SAMPLES\n", - " pipeline interface yaml files. These will be added to\n", - " the project config file to make it immediately\n", - " compatible with looper. [Default: null]\n", - " --pipeline-project PIPELINE_PROJECT\n", - " Optional: Specify one or more filepaths to PROJECT\n", - " pipeline interface yaml files. These will be added to\n", - " the project config file to make it immediately\n", - " compatible with looper. [Default: null]\n", - " -k SKIP, --skip SKIP Skip some accessions. [Default: no skip].\n", - " --acc-anno Optional: Produce annotation sheets for each\n", - " accession. Project combined PEP for the whole project\n", - " won't be produced.\n", - " --discard-soft Optional: After creation of PEP files, all soft and\n", - " additional files will be deleted\n", - " --const-limit-project CONST_LIMIT_PROJECT\n", - " Optional: Limit of the number of the constant sample\n", - " characters that should not be in project yaml.\n", - " [Default: 50]\n", - " --const-limit-discard CONST_LIMIT_DISCARD\n", - " Optional: Limit of the number of the constant sample\n", - " characters that should not be discarded [Default: 250]\n", - " --attr-limit-truncate ATTR_LIMIT_TRUNCATE\n", - " Optional: Limit of the number of sample characters.Any\n", - " attribute with more than X characters will truncate to\n", - " the first X, where X is a number of characters\n", - " [Default: 500]\n", - " --add-dotfile Optional: Add .pep.yaml file that points .yaml PEP\n", - " file\n", - " --silent Silence logging. 
Overrides verbosity.\n", - " --verbosity V Set logging level (1-5 or logging module level name)\n", - " --logdev Expand content of logging message format.\n", - "\n", - "processed:\n", - " -p, --processed Download processed data [Default: download raw data].\n", - " --data-source {all,samples,series}\n", - " Optional: Specifies the source of data on the GEO\n", - " record to retrieve processed data, which may be\n", - " attached to the collective series entity, or to\n", - " individual samples. Allowable values are: samples,\n", - " series or both (all). Ignored unless 'processed' flag\n", - " is set. [Default: samples]\n", - " --filter FILTER Optional: Filter regex for processed filenames\n", - " [Default: None].Ignored unless 'processed' flag is\n", - " set.\n", - " --filter-size FILTER_SIZE\n", - " Optional: Filter size for processed files that are\n", - " stored as sample repository [Default: None]. Works\n", - " only for sample data. Supported input formats : 12B,\n", - " 12KB, 12MB, 12GB. Ignored unless 'processed' flag is\n", - " set.\n", - " -g GEO_FOLDER, --geo-folder GEO_FOLDER\n", - " Optional: Specify a location to store processed GEO\n", - " files. Ignored unless 'processed' flag is\n", - " set.[Default: $GEODATA:]\n", - "\n", - "raw:\n", - " -x, --split-experiments\n", - " Split SRR runs into individual samples. By default,\n", - " SRX experiments with multiple SRR Runs will have a\n", - " single entry in the annotation table, with each run as\n", - " a separate row in the subannotation table. This\n", - " setting instead treats each run as a separate sample\n", - " -b BAM_FOLDER, --bam-folder BAM_FOLDER\n", - " Optional: Specify folder of bam files. Geofetch will\n", - " not download sra files when corresponding bam files\n", - " already exist. [Default: $SRABAM:]\n", - " -f FQ_FOLDER, --fq-folder FQ_FOLDER\n", - " Optional: Specify folder of fastq files. 
Geofetch will\n", - " not download sra files when corresponding fastq files\n", - " already exist. [Default: $SRAFQ:]\n", - " --use-key-subset Use just the keys defined in this module when writing\n", - " out metadata.\n" - ] - } - ], - "source": [ - "geofetch -h" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -176,6 +47,29 @@ "Complete details about geofetch outputs is cataloged in the [metadata outputs reference](metadata_output.md)." ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "from IPython.core.display import SVG\n", + "SVG(filename='logo.svg')" + ] + }, + { + "attachments": { + "arguments_outputs.svg": { + "image/svg+xml": [ + "" + ] + } + }, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![arguments_outputs.svg](attachment:arguments_outputs.svg)" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/docs_jupyter/python-usage.ipynb b/docs_jupyter/python-usage.ipynb new file mode 100644 index 0000000..9b34736 --- /dev/null +++ b/docs_jupyter/python-usage.ipynb @@ -0,0 +1,712 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "67fc2596", + "metadata": {}, + "source": [ + "# Tutorial: using geofetch as a Python package" + ] + }, + { + "cell_type": "markdown", + "id": "3ced4b1e", + "metadata": {}, + "source": [ + "♪♫*•♪♪♫*•♪♪♫*•♪♪♫*•♪♪♫*" + ] + }, + { + "cell_type": "markdown", + "id": "0093b8ef", + "metadata": {}, + "source": [ + "Geofetch provides Python functions to fetch data and metadata from GEO and SRA. The `get_projects` method returns a dictionary of peppy projects that were found using the filters and input you specified.\n", + " peppy is a Python package that provides an API for handling standardized project and sample metadata. 
\n", + " \n", + "You can find more information here:\n", + " \n", + "http://peppy.databio.org/en/latest/\n", + "\n", + "http://pep.databio.org/en/2.0.0/" + ] + }, + { + "cell_type": "markdown", + "id": "64746e18", + "metadata": {}, + "source": [ + "### First let's import geofetch" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "60b65668", + "metadata": {}, + "outputs": [], + "source": [ + "from geofetch import Geofetcher" + ] + }, + { + "cell_type": "markdown", + "id": "b6edbdd7", + "metadata": {}, + "source": [ + "### Initiate Geofetch object by specifying parameters that you want to use for downloading metadata/data" + ] + }, + { + "cell_type": "markdown", + "id": "dc107c16", + "metadata": {}, + "source": [ + "1) If you don't specify any parameters, default parameters will be used" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "af268078", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name\n" + ] + } + ], + "source": [ + "geof = Geofetcher()" + ] + }, + { + "cell_type": "markdown", + "id": "1916922e", + "metadata": {}, + "source": [ + "2) To download processed data with samples and series, specify these two arguments:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d451856a", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name\n" + ] + } + ], + "source": [ + "geof = Geofetcher(processed=True, data_source=\"all\")" + ] + }, + { + "cell_type": "markdown", + "id": "8debdd11", + "metadata": {}, + "source": [ + "3) To tune the project parameters that control where metadata is stored, use the following parameters:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f8edb462", + 
"metadata": { + "scrolled": 
true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name\n" + ] + } + ], + "source": [ + "geof = Geofetcher(processed=True, data_source=\"all\", const_limit_project = 20, const_limit_discard = 500, attr_limit_truncate = 10000 )" + ] + }, + { + "cell_type": "markdown", + "id": "d2739b13", + "metadata": {}, + "source": [ + "4) To add more filter of other options see documentation" + ] + }, + { + "cell_type": "markdown", + "id": "00b66d4a", + "metadata": {}, + "source": [ + "## Run Geofetch" + ] + }, + { + "cell_type": "markdown", + "id": "5e6c5df8", + "metadata": {}, + "source": [ + "### By default: \n", + "1) No actual data will be downloaded (just_metadata=True)\n", + "\n", + "2) No soft files will be saved on the disc (discard_soft=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "12d70387", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Trying GSE95654 (not a file) as accession...\n", + "Trying GSE95654 (not a file) as accession...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0f96c1a1ee8c48f4af31e0dc939fe116", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Skipped 0 accessions. Starting now.\n", + "\u001b[38;5;200mProcessing accession 1 of 1: 'GSE95654'\u001b[0m\n", + "\n", + "Total number of processed SAMPLES files found is: 40\n", + "Total number of processed SERIES files found is: 0\n", + "Expanding metadata list...\n", + "Expanding metadata list...\n" + ] + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Finished processing 1 accession(s)\n", + "Cleaning soft files ...\n", + "Unifying and saving of metadata... \n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b2c4c738728b4b43938fa6e7f29615ef", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "02401b3d938a4a588052ba99af677f84", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No files found. No data to save. File /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name/GSE95654_series/GSE95654_series.csv won't be created\n" + ] + } + ], + "source": [ + "projects = geof.get_projects(\"GSE95654\")" + ] + }, + { + "cell_type": "markdown", + "id": "bc198009", + "metadata": {}, + "source": [ + "Check if projects were created by checking dict keys:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "95896f25", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['GSE95654_samples'])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "projects.keys()" + ] + }, + { + "cell_type": "markdown", + "id": "4e27f971", + "metadata": {}, + "source": [ + "A project for samples was created! Now let's look into it." + ] + }, + { + "cell_type": "markdown", + "id": "fa2d0bda", + "metadata": {}, + "source": [ + "\\* the values of the dictionary are peppy projects. You can find more information about the peppy Project in the documentation: http://peppy.databio.org/en/latest/" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e8642711", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "40" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(projects['GSE95654_samples'].samples)" + ] + }, + { + "cell_type": "markdown", + "id": "a4d50082", + "metadata": {}, + "source": [ + "We got 40 samples from the GSE95654 project. 
If you want to check that this is correct, go to: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE95654" + ] + }, + { + "cell_type": "markdown", + "id": "d0cd958a", + "metadata": {}, + "source": [ + "Now let's see the actual data. 
First 15 samples and 5 columns:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ba7be762", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_namesample_library_strategygenome_buildtissuesample_organism_ch1
sample_name
RRBS_on_CRC_patient_8RRBS_on_CRC_patient_8Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_8RRBS_on_adjacent_normal_colon_patient_8Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_CRC_patient_32RRBS_on_CRC_patient_32Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_32RRBS_on_adjacent_normal_colon_patient_32Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_CRC_patient_41RRBS_on_CRC_patient_41Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_41RRBS_on_adjacent_normal_colon_patient_41Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_CRC_patient_42RRBS_on_CRC_patient_42Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_42RRBS_on_adjacent_normal_colon_patient_42Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_ACF_patient_173RRBS_on_ACF_patient_173Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
RRBS_on_ACF_patient_515RRBS_on_ACF_patient_515Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
RRBS_on_normal_crypts_patient_139RRBS_on_normal_crypts_patient_139Bisulfite-Seqhg19normal colonic cryptHomo sapiens
RRBS_on_ACF_patient_143RRBS_on_ACF_patient_143Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
RRBS_on_normal_crypts_patient_143RRBS_on_normal_crypts_patient_143Bisulfite-Seqhg19normal colonic cryptHomo sapiens
RRBS_on_normal_crypts_patient_165RRBS_on_normal_crypts_patient_165Bisulfite-Seqhg19normal colonic cryptHomo sapiens
RRBS_on_ACF_patient_165RRBS_on_ACF_patient_165Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
\n", + "
" + ], + "text/plain": [ + " sample_name \\\n", + "sample_name \n", + "RRBS_on_CRC_patient_8 RRBS_on_CRC_patient_8 \n", + "RRBS_on_adjacent_normal_colon_patient_8 RRBS_on_adjacent_normal_colon_patient_8 \n", + "RRBS_on_CRC_patient_32 RRBS_on_CRC_patient_32 \n", + "RRBS_on_adjacent_normal_colon_patient_32 RRBS_on_adjacent_normal_colon_patient_32 \n", + "RRBS_on_CRC_patient_41 RRBS_on_CRC_patient_41 \n", + "RRBS_on_adjacent_normal_colon_patient_41 RRBS_on_adjacent_normal_colon_patient_41 \n", + "RRBS_on_CRC_patient_42 RRBS_on_CRC_patient_42 \n", + "RRBS_on_adjacent_normal_colon_patient_42 RRBS_on_adjacent_normal_colon_patient_42 \n", + "RRBS_on_ACF_patient_173 RRBS_on_ACF_patient_173 \n", + "RRBS_on_ACF_patient_515 RRBS_on_ACF_patient_515 \n", + "RRBS_on_normal_crypts_patient_139 RRBS_on_normal_crypts_patient_139 \n", + "RRBS_on_ACF_patient_143 RRBS_on_ACF_patient_143 \n", + "RRBS_on_normal_crypts_patient_143 RRBS_on_normal_crypts_patient_143 \n", + "RRBS_on_normal_crypts_patient_165 RRBS_on_normal_crypts_patient_165 \n", + "RRBS_on_ACF_patient_165 RRBS_on_ACF_patient_165 \n", + "\n", + " sample_library_strategy genome_build \\\n", + "sample_name \n", + "RRBS_on_CRC_patient_8 Bisulfite-Seq hg19 \n", + "RRBS_on_adjacent_normal_colon_patient_8 Bisulfite-Seq hg19 \n", + "RRBS_on_CRC_patient_32 Bisulfite-Seq hg19 \n", + "RRBS_on_adjacent_normal_colon_patient_32 Bisulfite-Seq hg19 \n", + "RRBS_on_CRC_patient_41 Bisulfite-Seq hg19 \n", + "RRBS_on_adjacent_normal_colon_patient_41 Bisulfite-Seq hg19 \n", + "RRBS_on_CRC_patient_42 Bisulfite-Seq hg19 \n", + "RRBS_on_adjacent_normal_colon_patient_42 Bisulfite-Seq hg19 \n", + "RRBS_on_ACF_patient_173 Bisulfite-Seq hg19 \n", + "RRBS_on_ACF_patient_515 Bisulfite-Seq hg19 \n", + "RRBS_on_normal_crypts_patient_139 Bisulfite-Seq hg19 \n", + "RRBS_on_ACF_patient_143 Bisulfite-Seq hg19 \n", + "RRBS_on_normal_crypts_patient_143 Bisulfite-Seq hg19 \n", + "RRBS_on_normal_crypts_patient_165 Bisulfite-Seq hg19 \n", + 
"RRBS_on_ACF_patient_165 Bisulfite-Seq hg19 \n", + "\n", + " tissue \\\n", + "sample_name \n", + "RRBS_on_CRC_patient_8 primary tumor \n", + "RRBS_on_adjacent_normal_colon_patient_8 adjacent normal colon \n", + "RRBS_on_CRC_patient_32 primary tumor \n", + "RRBS_on_adjacent_normal_colon_patient_32 adjacent normal colon \n", + "RRBS_on_CRC_patient_41 primary tumor \n", + "RRBS_on_adjacent_normal_colon_patient_41 adjacent normal colon \n", + "RRBS_on_CRC_patient_42 primary tumor \n", + "RRBS_on_adjacent_normal_colon_patient_42 adjacent normal colon \n", + "RRBS_on_ACF_patient_173 aberrant crypt foci \n", + "RRBS_on_ACF_patient_515 aberrant crypt foci \n", + "RRBS_on_normal_crypts_patient_139 normal colonic crypt \n", + "RRBS_on_ACF_patient_143 aberrant crypt foci \n", + "RRBS_on_normal_crypts_patient_143 normal colonic crypt \n", + "RRBS_on_normal_crypts_patient_165 normal colonic crypt \n", + "RRBS_on_ACF_patient_165 aberrant crypt foci \n", + "\n", + " sample_organism_ch1 \n", + "sample_name \n", + "RRBS_on_CRC_patient_8 Homo sapiens \n", + "RRBS_on_adjacent_normal_colon_patient_8 Homo sapiens \n", + "RRBS_on_CRC_patient_32 Homo sapiens \n", + "RRBS_on_adjacent_normal_colon_patient_32 Homo sapiens \n", + "RRBS_on_CRC_patient_41 Homo sapiens \n", + "RRBS_on_adjacent_normal_colon_patient_41 Homo sapiens \n", + "RRBS_on_CRC_patient_42 Homo sapiens \n", + "RRBS_on_adjacent_normal_colon_patient_42 Homo sapiens \n", + "RRBS_on_ACF_patient_173 Homo sapiens \n", + "RRBS_on_ACF_patient_515 Homo sapiens \n", + "RRBS_on_normal_crypts_patient_139 Homo sapiens \n", + "RRBS_on_ACF_patient_143 Homo sapiens \n", + "RRBS_on_normal_crypts_patient_143 Homo sapiens \n", + "RRBS_on_normal_crypts_patient_165 Homo sapiens \n", + "RRBS_on_ACF_patient_165 Homo sapiens " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "projects['GSE95654_samples'].sample_table.iloc[:15 , :5]" + ] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/geofetch/__init__.py b/geofetch/__init__.py index 003168a..ef3887f 100644 --- a/geofetch/__init__.py +++ b/geofetch/__init__.py @@ -1,5 +1,6 @@ """ Package-level data """ from .geofetch import * +from .finder import * from ._version import __version__ import logmuse diff --git a/geofetch/_version.py b/geofetch/_version.py index 1f4c4d4..ae6db5f 100644 --- a/geofetch/_version.py +++ b/geofetch/_version.py @@ -1 +1 @@ -__version__ = "0.10.1" +__version__ = "0.11.0" diff --git a/geofetch/cli.py b/geofetch/cli.py new file mode 100644 index 0000000..1d5bcce --- /dev/null +++ b/geofetch/cli.py @@ -0,0 +1,286 @@ +import argparse +import os +import logmuse +from ._version import __version__ + + +def _safe_echo(var): + """Returns an environment variable if it exists, or an empty string if not""" + return os.getenv(var, "") + + +def _parse_cmdl(cmdl): + """ + parser + """ + parser = argparse.ArgumentParser( + description="Automatic GEO and SRA data downloader", + usage="""geofetch [] + +The example how to use geofetch (to download GSE573030 just metadata): + geofetch -i GSE67303 -m `pwd` --just-metadata + +To download all processed data of GSE57303: + geofetch -i GSE67303 --processed --geo-folder `pwd` -m `pwd` + +* where `pwd` is a current directory + +""", + ) + + processed_group = parser.add_argument_group("processed") + raw_group = parser.add_argument_group("raw") + + parser.add_argument( + "-V", "--version", action="version", version=f"%(prog)s {__version__}" + ) + + # Required + parser.add_argument( + "-i", + "--input", + dest="input", + required=True, + help="required: a 
GEO (GSE) accession, or a file with a list of GSE numbers", + ) + + # Optional + parser.add_argument( + "-n", "--name", help="Specify a project name. Defaults to GSE number" + ) + + parser.add_argument( + "-m", + "--metadata-root", + dest="metadata_root", + default=_safe_echo("SRAMETA"), + help="Specify a parent folder location to store metadata. " + "The project name will be added as a subfolder " + "[Default: $SRAMETA:" + _safe_echo("SRAMETA") + "]", + ) + + parser.add_argument( + "-u", + "--metadata-folder", + help="Specify an absolute folder location to store metadata. " + "No subfolder will be added. Overrides value of --metadata-root " + "[Default: Not used (--metadata-root is used by default)]", + ) + + parser.add_argument( + "--just-metadata", + action="store_true", + help="If set, don't actually run downloads, just create metadata", + ) + + parser.add_argument( + "-r", + "--refresh-metadata", + action="store_true", + help="If set, re-download metadata even if it exists.", + ) + + parser.add_argument( + "--config-template", default=None, help="Project config yaml file template." + ) + + # Optional + parser.add_argument( + "--pipeline-samples", + default=None, + help="Optional: Specify one or more filepaths to SAMPLES pipeline interface yaml files. " + "These will be added to the project config file to make it immediately " + "compatible with looper. [Default: null]", + ) + + # Optional + parser.add_argument( + "--pipeline-project", + default=None, + help="Optional: Specify one or more filepaths to PROJECT pipeline interface yaml files. " + "These will be added to the project config file to make it immediately " + "compatible with looper. [Default: null]", + ) + # Optional + parser.add_argument( + "--disable-progressbar", + action="store_true", + help="Optional: Disable progressbar", + ) + + # Optional + parser.add_argument( + "-k", + "--skip", + default=0, + type=int, + help="Skip some accessions. 
[Default: no skip].", + ) + + parser.add_argument( + "--acc-anno", + action="store_true", + help="Optional: Produce annotation sheets for each accession." + " Project combined PEP for the whole project won't be produced.", + ) + + parser.add_argument( + "--discard-soft", + action="store_true", + help="Optional: After creation of PEP files, all soft and additional files will be deleted", + ) + + parser.add_argument( + "--const-limit-project", + type=int, + default=50, + help="Optional: Limit of the number of the constant sample characters " + "that should not be in project yaml. [Default: 50]", + ) + + parser.add_argument( + "--const-limit-discard", + type=int, + default=250, + help="Optional: Limit of the number of the constant sample characters " + "that should not be discarded [Default: 250]", + ) + + parser.add_argument( + "--attr-limit-truncate", + type=int, + default=500, + help="Optional: Limit of the number of sample characters." + "Any attribute with more than X characters will truncate to the first X," + " where X is a number of characters [Default: 500]", + ) + + parser.add_argument( + "--add-dotfile", + action="store_true", + help="Optional: Add .pep.yaml file that points .yaml PEP file", + ) + + processed_group.add_argument( + "-p", + "--processed", + default=False, + action="store_true", + help="Download processed data [Default: download raw data].", + ) + + processed_group.add_argument( + "--data-source", + dest="data_source", + choices=["all", "samples", "series"], + default="samples", + help="Optional: Specifies the source of data on the GEO record" + " to retrieve processed data, which may be attached to the" + " collective series entity, or to individual samples. " + "Allowable values are: samples, series or both (all). " + "Ignored unless 'processed' flag is set. [Default: samples]", + ) + + processed_group.add_argument( + "--filter", + default=None, + help="Optional: Filter regex for processed filenames [Default: None]." 
+ "Ignored unless 'processed' flag is set.", + ) + + processed_group.add_argument( + "--filter-size", + dest="filter_size", + default=None, + help="""Optional: Filter size for processed files + that are stored as sample repository [Default: None]. + Works only for sample data. + Supported input formats : 12B, 12KB, 12MB, 12GB. + Ignored unless 'processed' flag is set.""", + ) + + processed_group.add_argument( + "-g", + "--geo-folder", + default=_safe_echo("GEODATA"), + help="Optional: Specify a location to store processed GEO files." + " Ignored unless 'processed' flag is set." + "[Default: $GEODATA:" + _safe_echo("GEODATA") + "]", + ) + + raw_group.add_argument( + "-x", + "--split-experiments", + action="store_true", + help="""Split SRR runs into individual samples. By default, SRX + experiments with multiple SRR Runs will have a single entry in the + annotation table, with each run as a separate row in the + subannotation table. This setting instead treats each run as a + separate sample""", + ) + + raw_group.add_argument( + "-b", + "--bam-folder", + dest="bam_folder", + default=_safe_echo("SRABAM"), + help="""Optional: Specify folder of bam files. Geofetch will not + download sra files when corresponding bam files already exist. + [Default: $SRABAM:""" + + _safe_echo("SRABAM") + + "]", + ) + + raw_group.add_argument( + "-f", + "--fq-folder", + dest="fq_folder", + default=_safe_echo("SRAFQ"), + help="""Optional: Specify folder of fastq files. Geofetch will not + download sra files when corresponding fastq files already exist. + [Default: $SRAFQ:""" + + _safe_echo("SRAFQ") + + "]", + ) + + # Deprecated; these are for bam conversion which now happens in sra_convert + # it still works here but I hide it so people don't use it, because it's confusing. 
+ raw_group.add_argument( + "-s", + "--sra-folder", + dest="sra_folder", + default=_safe_echo("SRARAW"), + help=argparse.SUPPRESS, + # help="Optional: Specify a location to store sra files " + # "[Default: $SRARAW:" + safe_echo("SRARAW") + "]" + ) + raw_group.add_argument( + "--bam-conversion", + action="store_true", + # help="Turn on sequential bam conversion. Default: No conversion.", + help=argparse.SUPPRESS, + ) + + raw_group.add_argument( + "--picard-path", + dest="picard_path", + default=_safe_echo("PICARD"), + # help="Specify a path to the picard jar, if you want to convert " + # "fastq to bam [Default: $PICARD:" + safe_echo("PICARD") + "]", + help=argparse.SUPPRESS, + ) + + raw_group.add_argument( + "--use-key-subset", + action="store_true", + help="Use just the keys defined in this module when writing out metadata.", + ) + + raw_group.add_argument( + "--add-convert-modifier", + action="store_true", + help="Add looper SRA convert modifier to config file.", + ) + + logmuse.add_logging_options(parser) + return parser.parse_args(cmdl) diff --git a/geofetch/config_processed_template.yaml b/geofetch/config_processed_template.yaml index 66690c9..1198863 100644 --- a/geofetch/config_processed_template.yaml +++ b/geofetch/config_processed_template.yaml @@ -7,8 +7,12 @@ sample_table: {sample_table} sample_modifiers: append: output_file_path: FILES + # Project metadata: {additional_columns} + # End of project metadata {pipeline_samples} + + # adding file paths to the project derive: attributes: [output_file_path] sources: diff --git a/geofetch/config_template.yaml b/geofetch/config_template.yaml index b492ae6..588d81a 100644 --- a/geofetch/config_template.yaml +++ b/geofetch/config_template.yaml @@ -3,58 +3,14 @@ name: {project_name} pep_version: 2.1.0 sample_table: {annotation} -subsample_table: {subannotation} +{subannotation} -looper: - output_dir: {project_name} - pipeline_interfaces: {pipeline_interfaces} - -sample_modifiers: - append: +{sample_modifier_str} + # 
import re

# Module-level logger placeholder; nothing in this module assigns it.
_LOGGER = None

# Hard-coded subset of annotation-sheet columns. Used when only a few keys
# should be written instead of everything GEO provides
# (see the `--use-key-subset` option).
ANNOTATION_SHEET_KEYS = [
    # generic sample description
    "sample_name",
    "protocol",
    "read_type",
    "organism",
    "data_source",
    # fields named after GEO sample attributes
    "Sample_title",
    "Sample_source_name_ch1",
    "Sample_organism_ch1",
    "Sample_library_selection",
    "Sample_library_strategy",
    "Sample_type",
    # accession identifiers
    "SRR",
    "SRX",
    "Sample_geo_accession",
    "Sample_series_id",
    "Sample_instrument_model",
]

# Regular expressions that pull accession identifiers out of text:
# each captures the prefix followed by 4 to 8 digits.
PROJECT_PATTERN = re.compile(r"(SRP\d{4,8})")
EXPERIMENT_PATTERN = re.compile(r"(SRX\d{4,8})")
GSE_PATTERN = re.compile(r"(GSE\d{4,8})")

# Patterns matching the supplementary-file keys of samples and series.
SUPP_FILE_PATTERN = re.compile("Sample_supplementary_file")
SER_SUPP_FILE_PATTERN = re.compile("Series_supplementary_file")

# File-name endings of the generated metadata tables.
SAMPLE_SUPP_METADATA_FILE = "_samples.csv"
EXP_SUPP_METADATA_FILE = "_series.csv"
FILE_RAW_NAME_SAMPLE_PATTERN = "_raw.csv"
FILE_RAW_NAME_SUBSAMPLE_PATTERN = "_raw_subtable.csv"

# Number of times a failing prefetch call is retried.
NUM_RETRIES = 3
# NOTE(review): presumably the pause (in seconds) between consecutive
# requests -- confirm against the callers.
REQUEST_SLEEP = 0.4

# NCBI E-utilities URL templates for the SRA database.
NCBI_ESEARCH = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    "?db=sra&term={SRP_NUMBER}&retmax=999&rettype=uilist&retmode=json"
)
NCBI_EFETCH = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    "?db=sra&id={ID}&rettype=runinfo&retmode=xml"
)

NEW_GENOME_COL_NAME = "ref_genome"

# Names of the config template files.
CONFIG_PROCESSED_TEMPLATE_NAME = "config_processed_template.yaml"
CONFIG_RAW_TEMPLATE_NAME = "config_template.yaml"
CONFIG_SRA_TEMPLATE = "looper_sra_convert.yaml"

# --- Constants for Finder -------------------------------------------------

# Default `retmax` value substituted into ETOOLS_ENDING; the original note
# says it will have to be increased at some point.
RETMAX = 10000000

# esearch endpoint on the `gds` database (gds = GEO DataSets).
ETOOLS_GEO_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds"
# Same endpoint, restricted to entries of type GSE.
ETOOLS_GEO_GSE_BASE = ETOOLS_GEO_BASE + "&term=GSE[ETYP]"

# Query-string tail; `retmax` is filled in with str.format.
ETOOLS_ENDING = "&retmax={retmax}&usehistory=y"

# Despite the name this is the fixed string "3000".
# NOTE(review): looks like a far-future year acting as an open-ended upper
# bound for date ranges -- confirm against the code that uses it.
TODAY_DATE = "3000"

# Publication-date range filter; `start_date` and `end_date` are filled in
# with str.format.
DATE_FILTER = (
    '+AND+("{start_date}"[Publication%20Date]'
    '%20:%20"{end_date}"[Publication%20Date])'
)
# Filter restricting results to records published in the last 3 months.
THREE_MONTH_FILTER = '+AND+"published+last+3+months"[Filter]'
from .const import (
    RETMAX,
    ETOOLS_GEO_GSE_BASE,
    ETOOLS_ENDING,
    TODAY_DATE,
    DATE_FILTER,
    THREE_MONTH_FILTER,
)
import requests
import xmltodict
import re
import os
import logmuse
import coloredlogs
from datetime import datetime
from datetime import timedelta

__author__ = "Oleksandr Khoroshevskyi"

_LOGGER = logmuse.init_logger("pepannot")
coloredlogs.install(
    logger=_LOGGER,
    datefmt="%H:%M:%S",
    fmt="[%(levelname)s] [%(asctime)s] %(message)s",
)


class Finder:
    """
    Find GSE accessions in GEO, optionally restricted to a period of time.

    Extra search filters can be supplied when the object is created; they are
    appended to every query this object sends.
    """

    # Compiled once for uid_to_gse: skips the leading non-zero digits and the
    # zero padding that follows, and captures the remaining number.
    _UID_PATTERN = re.compile(r"[1-9]+0+([1-9]+[0-9]*)")

    def __init__(self, filters: str = None, retmax: int = RETMAX):
        """
        :param filters: filters that have to be added to the query.
            Filter Patterns can be found here:
            https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Using_the_Advanced_Search_Pag
        :param retmax: maximum number of retrieved accessions.
        """
        self.query_customized_ending = ETOOLS_ENDING.format(retmax=retmax)
        self.query_filter_str = self._create_filter_str(filters)
        # result of the most recent query; default input of generate_file
        self.last_result = []

    def get_gse_all(self) -> list:
        """
        Get list of all gse accessions available in GEO
        :return: list of gse accessions
        """
        return self.get_gse_id_by_query(url=self._compose_url())

    def get_gse_last_3_month(self) -> list:
        """
        Get list of gse accessions that were uploaded or updated in the last 3 months
        :return: list of gse accessions
        """
        return self.get_gse_id_by_query(url=self._compose_url(THREE_MONTH_FILTER))

    def get_gse_last_week(self) -> list:
        """
        Get list of gse accessions that were uploaded or updated in the last week
        :return: list of gse accessions
        """
        return self.get_gse_by_day_count(7)

    def get_gse_by_day_count(self, n_days: int = 1) -> list:
        """
        Get list of gse accessions that were uploaded or updated in the last X days
        :param n_days: number of days from now [e.g. 5]
        :return: list of gse accessions
        """
        start_date = datetime.today() - timedelta(days=n_days)
        return self.get_gse_by_date(start_date.strftime("%Y/%m/%d"))

    def get_gse_by_date(self, start_date: str, end_date: str = None) -> list:
        """
        Search gse accessions by providing start date and end date.
        When no end date is given, the TODAY_DATE constant is used as the end
        of the range.
        :param start_date: the oldest date of update (from YYYY/MM/DD to now) [input format: 'YYYY/MM/DD']
        :param end_date: the nearest date of update (from __ to YYYY/MM/DD) [input format: 'YYYY/MM/DD']
        :return: list of gse accessions
        """
        if end_date is None:
            end_date = TODAY_DATE
        new_date_filter = DATE_FILTER.format(start_date=start_date, end_date=end_date)
        return self.get_gse_id_by_query(url=self._compose_url(new_date_filter))

    def get_gse_id_by_query(self, url: str) -> list:
        """
        Run esearch (ncbi search tool) by specifying URL and retrieve gse list result.
        The result is also stored in `self.last_result`.
        :param url: url of the query
        :return: list of gse ids
        """
        uids_list = self._run_search_query(url)
        gse_id_list = [self.uid_to_gse(uid) for uid in uids_list]
        self.last_result = gse_id_list
        return gse_id_list

    @staticmethod
    def uid_to_gse(uid: str) -> str:
        """
        UID to GSE accession converter, e.g. '200012345' -> 'GSE12345'
        :param uid: uid string (Unique Identifier Number in GEO)
        :return: GSE id string
        :raises ValueError: if the uid does not have the expected form
        """
        matched = Finder._UID_PATTERN.match(uid)
        if matched is None:
            # fail with a readable message instead of an AttributeError on None
            raise ValueError(f"'{uid}' is not a valid GEO series UID")
        return "GSE" + matched.group(1)

    @staticmethod
    def find_differences(old_list: list, new_list: list) -> list:
        """
        Compare 2 lists and search for elements that are not in old list
        :param old_list: old list of elements
        :param new_list: new list of elements
        :return: list of elements that are not in old list but are in new_list
            (duplicates removed, order not preserved)
        """
        return list(set(new_list) - set(old_list))

    @staticmethod
    def _run_search_query(url: str) -> list:
        """
        Run get request and return list of uids found.
        Any non-200 answer or unparsable response is logged and yields an
        empty list.
        :param url: url of the query
        :return: list of UIDs
        """
        response = requests.get(url)
        if response.status_code != 200:
            _LOGGER.error("Request status != 200. Error. Check your request")
            return []

        try:
            search_result = xmltodict.parse(response.text)["eSearchResult"]
        except Exception as err:
            # best effort: a malformed or unexpected response is reported,
            # not raised
            _LOGGER.error(f"Unable to parse search response: {err}")
            return []

        if not isinstance(search_result, dict):
            # empty <eSearchResult/> element
            return []

        _LOGGER.info(f"Found elements: {search_result.get('Count')}")
        _LOGGER.info(f"Additional information: {search_result.get('TranslationSet')}")

        # an empty <IdList/> is parsed as None
        id_list = search_result.get("IdList")
        if not id_list:
            return []

        uids = id_list.get("Id")
        if uids is None:
            return []
        if isinstance(uids, str):
            # xmltodict returns a bare string (not a list) for a single <Id>
            return [uids]
        return list(uids)

    @staticmethod
    def _create_filter_str(filters: str = None) -> str:
        """
        Tune filter for url request
        :param filters: filter should look like here: https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Using_the_Advanced_Search_Pag
        :return: tuned filter string (empty string when no filter is given)
        """
        if filters == "" or filters is None:
            return ""
        # NOTE(review): the other filters in this package are written as
        # '+AND+(...)', this one as '+(AND+...)' -- confirm the intended form.
        return f"+(AND+{filters})"

    def _compose_url(self, date_filter: str = None) -> str:
        """
        Compose final url by adding date filter
        :param date_filter: date filter that has to be used in the query
        :return: string of final url
        """
        if date_filter is None:
            date_filter = ""

        return f"{ETOOLS_GEO_GSE_BASE}{self.query_filter_str}{date_filter}{self.query_customized_ending}"

    def generate_file(self, file_path: str, gse_list: list = None):
        """
        Save a list of GSE accessions to a given file, one accession per line.
        If the target directory does not exist, an error is logged and
        nothing is written.
        :param file_path: path to the file where gse accessions have to be saved
        :param gse_list: list of gse accessions; defaults to the result of the
            last query run by this object
        :return: None
        """
        if gse_list is None:
            gse_list = self.last_result

        file_dir = os.path.split(file_path)[0]
        if file_dir and not os.path.exists(file_dir):
            _LOGGER.error(f"Path: '{file_dir}' does not exist! No file will be saved")
            # do what the message says instead of failing inside open()
            return

        with open(file_path, "w") as fp:
            fp.writelines(f"{item}\n" for item in gse_list)
        _LOGGER.info("File has been saved!")
Use with `--use-key-subset` -ANNOTATION_SHEET_KEYS = [ - "sample_name", - "protocol", - "read_type", - "organism", - "data_source", - "Sample_title", - "Sample_source_name_ch1", - "Sample_organism_ch1", - "Sample_library_selection", - "Sample_library_strategy", - "Sample_type", - "SRR", - "SRX", - "Sample_geo_accession", - "Sample_series_id", - "Sample_instrument_model", -] - -# Regex to parse out SRA accession identifiers -PROJECT_PATTERN = re.compile(r"(SRP\d{4,8})") -EXPERIMENT_PATTERN = re.compile(r"(SRX\d{4,8})") -GSE_PATTERN = re.compile(r"(GSE\d{4,8})") -SUPP_FILE_PATTERN = re.compile("Sample_supplementary_file") -SER_SUPP_FILE_PATTERN = re.compile("Series_supplementary_file") - -SAMPLE_SUPP_METADATA_FILE = "_samples.csv" -EXP_SUPP_METADATA_FILE = "_series.csv" - -# How many times should we retry failing prefetch call? -NUM_RETRIES = 3 -REQUEST_SLEEP = 0.4 - -NCBI_ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term={SRP_NUMBER}&retmax=999&rettype=uilist&retmode=json" -NCBI_EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&id={ID}&rettype=runinfo&retmode=xml" +from typing import List, Union, Dict, Tuple, NoReturn +import peppy +import pandas as pd class Geofetcher: + """ + Class to download or get projects, metadata, data from GEO and SRA + """ + def __init__( self, - name="", - metadata_root="", - metadata_folder="", - just_metadata=False, - refresh_metadata=False, - config_template=None, - pipeline_samples=None, - pipeline_project=None, - skip=0, - acc_anno=False, - use_key_subset=False, - processed=True, - data_source="samples", - filter=None, - filter_size=None, - geo_folder=".", - split_experiments=False, - bam_folder="", - fq_folder="", - sra_folder="", - bam_conversion=False, - picard_path="", - input=None, - const_limit_project=50, - const_limit_discard=250, - attr_limit_truncate=500, - discard_soft=False, - add_dotfile=False, + name: str = "", + metadata_root: str = "", + metadata_folder: str = 
"", + just_metadata: bool = False, + refresh_metadata: bool = False, + config_template: str = None, + pipeline_samples: str = None, + pipeline_project: str = None, + skip: int = 0, + acc_anno: bool = False, + use_key_subset: bool = False, + processed: bool = False, + data_source: str = "samples", + filter: str = None, + filter_size: str = None, + geo_folder: str = ".", + split_experiments: bool = False, + bam_folder: str = "", + fq_folder: str = "", + sra_folder: str = "", + bam_conversion: bool = False, + picard_path: str = "", + input: str = None, + const_limit_project: int = 50, + const_limit_discard: int = 250, + attr_limit_truncate: int = 500, + discard_soft: bool = False, + add_dotfile: bool = False, + disable_progressbar: bool = False, + add_convert_modifier: bool = False, opts=None, **kwargs, ): + """ + init function + :param input: GSEnumber or path to the input file + :param name: Specify a project name. Defaults to GSE number or name of accessions file name + :param metadata_root: Specify a parent folder location to store metadata. + The project name will be added as a subfolder [Default: $SRAMETA:] + :param metadata_folder: Specify an absolute folder location to store metadata. No subfolder will be added. + Overrides value of --metadata-root [Default: Not used (--metadata-root is used by default)] + :param just_metadata: If set, don't actually run downloads, just create metadata + :param refresh_metadata: If set, re-download metadata even if it exists. + :param config_template: Project config yaml file template. + :param pipeline_samples: Specify one or more filepaths to SAMPLES pipeline interface yaml files. + These will be added to the project config file to make it immediately compatible with looper. + [Default: null] + :param pipeline_project: Specify one or more filepaths to PROJECT pipeline interface yaml files. + These will be added to the project config file to make it immediately compatible with looper. 
+ [Default: null] + :param acc_anno: Produce annotation sheets for each accession. + Project combined PEP for the whole project won't be produced. + :param discard_soft: Create project without downloading soft files on the disc + :param add_dotfile: Add .pep.yaml file that points .yaml PEP file + :param disable_progressbar: Set true to disable progressbar + + :param const_limit_project: Optional: Limit of the number of the constant sample characters + that should not be in project yaml. [Default: 50] + :param const_limit_discard: Optional: Limit of the number of the constant sample characters + that should not be discarded [Default: 250] + :param attr_limit_truncate: Optional: Limit of the number of sample characters. + Any attribute with more than X characters will truncate to the first X, where X is a number of characters + [Default: 500] + + :param processed: Download processed data [Default: download raw data]. + :param data_source: Specifies the source of data on the GEO record to retrieve processed data, + which may be attached to the collective series entity, or to individual samples. Allowable values are: + samples, series or both (all). Ignored unless 'processed' flag is set. [Default: samples] + :param filter: Filter regex for processed filenames [Default: None].Ignored unless 'processed' flag is set. + :param filter_size: Filter size for processed files that are stored as sample repository [Default: None]. + Works only for sample data. Supported input formats : 12B, 12KB, 12MB, 12GB. + Ignored unless 'processed' flag is set. + :param geo_folder: Specify a location to store processed GEO files. + Ignored unless 'processed' flag is set.[Default: $GEODATA:] + + :param split_experiments: Split SRR runs into individual samples. By default, SRX experiments with multiple SRR + Runs will have a single entry in the annotation table, with each run as a separate row in the + subannotation table. 
This setting instead treats each run as a separate sample [Works with raw data] + :param bam_folder: Optional: Specify folder of bam files. Geofetch will not download sra files when + corresponding bam files already exist. [Default: $SRABAM:] [Works with raw data] + :param fq_folder: Optional: Specify folder of fastq files. Geofetch will not download sra files when corresponding + fastq files already exist. [Default: $SRAFQ:] [Works with raw data] + :param use_key_subset: Use just the keys defined in this module when writing out metadata. [Works with raw data] + :param sra_folder: Optional: Specify a location to store sra files + [Default: $SRARAW:" + safe_echo("SRARAW") + ] + :param bam_conversion: Optional: set True to convert bam files [Works with raw data] + :param picard_path: Specify a path to the picard jar, if you want to convert fastq to bam + [Default: $PICARD:" + safe_echo("PICARD") + "] [Works with raw data] + :param add_convert_modifier: Add looper SRA convert modifier to config file. + + :param skip: Skip some accessions. [Default: no skip]. 
+ :param opts: opts object [Optional] + :param kwargs: other values + """ - global _LOGGER if opts is not None: _LOGGER = logmuse.logger_via_cli(opts) else: @@ -125,18 +169,18 @@ def __init__( if metadata_folder: self.metadata_expanded = expandpath(metadata_folder) if os.path.isabs(self.metadata_expanded): - self.metadata_raw = metadata_folder + self.metadata_root_full = metadata_folder else: self.metadata_expanded = os.path.abspath(self.metadata_expanded) - self.metadata_raw = os.path.abspath(metadata_root) - self.metadata_raw = metadata_folder + self.metadata_root_full = os.path.abspath(metadata_root) + self.metadata_root_full = metadata_folder else: self.metadata_expanded = expandpath(metadata_root) if os.path.isabs(self.metadata_expanded): - self.metadata_raw = metadata_root + self.metadata_root_full = metadata_root else: self.metadata_expanded = os.path.abspath(self.metadata_expanded) - self.metadata_raw = os.path.abspath(metadata_root) + self.metadata_root_full = os.path.abspath(metadata_root) self.just_metadata = just_metadata self.refresh_metadata = refresh_metadata @@ -174,7 +218,9 @@ def __init__( self.metadata_expanded = os.path.join( self.metadata_expanded, self.project_name ) - self.metadata_raw = os.path.join(self.metadata_raw, self.project_name) + self.metadata_root_full = os.path.join( + self.metadata_root_full, self.project_name + ) if filter_size is not None: try: @@ -199,57 +245,140 @@ def __init__( self.discard_soft = discard_soft self.add_dotfile = add_dotfile - + self.disable_progressbar = disable_progressbar + self.add_convert_modifier = add_convert_modifier self._LOGGER.info(f"Metadata folder: {self.metadata_expanded}") - # check to make sure prefetch is callable - if not just_metadata and not processed: - if not is_command_callable("prefetch"): - raise SystemExit( - "To download raw data You must first install the sratoolkit, with prefetch in your PATH." 
- " Installation instruction: http://geofetch.databio.org/en/latest/install/" - ) - # Some sanity checks before proceeding - if bam_conversion and not just_metadata and not self.which("samtools"): + if bam_conversion and not just_metadata and not _which("samtools"): raise SystemExit("For SAM/BAM processing, samtools should be on PATH.") - def fetch_all(self, input, name=None): - """Main script driver/workflow""" + self.just_object = False - if name: + def get_projects( + self, input: str, just_metadata: bool = True, discard_soft: bool = True + ) -> dict: + """ + Function for fetching projects from GEO|SRA and receiving peppy project + :param input: GSE number, or path to file of GSE numbers + :param just_metadata: process only metadata + :param discard_soft: clean run, without downloading soft files + :return: peppy project or list of project, if acc_anno is set. + """ + self.just_metadata = just_metadata + self.just_object = True + self.discard_soft = discard_soft + acc_GSE_list = parse_accessions( + input, self.metadata_expanded, self.just_metadata + ) + + project_dict = {} + + # processed data: + if self.processed: + if self.acc_anno: + self.disable_progressbar = True + nkeys = len(acc_GSE_list.keys()) + ncount = 0 + self.acc_anno = False + for acc_GSE in acc_GSE_list.keys(): + ncount += 1 + self._LOGGER.info( + f"\033[38;5;200mProcessing accession {ncount} of {nkeys}: '{acc_GSE}'\033[0m" + ) + project_dict.update(self.fetch_all(input=acc_GSE, name=acc_GSE)) + else: + try: + project_n = os.path.splitext(os.path.basename(input))[0] + except TypeError: + project_n = input + project_dict.update(self.fetch_all(input=input, name=project_n)) + + # raw data: + else: + # Not sure about below code... 
+ if self.acc_anno: + self.disable_progressbar = True + self.acc_anno = False + nkeys = len(acc_GSE_list.keys()) + ncount = 0 + for acc_GSE in acc_GSE_list.keys(): + ncount += 1 + self._LOGGER.info( + f"\033[38;5;200mProcessing accession {ncount} of {nkeys}: '{acc_GSE}'\033[0m" + ) + project = self.fetch_all(input=acc_GSE) + project_dict[acc_GSE + "_raw"] = project + + else: + try: + project_n = os.path.splitext(os.path.basename(input))[0] + except TypeError: + project_n = input + ser_dict = self.fetch_all(input=input) + project_dict[project_n + "_raw"] = ser_dict + + new_pr_dict = {} + for pr_key in project_dict.keys(): + if project_dict[pr_key]: + new_pr_dict[pr_key] = project_dict[pr_key] + + return new_pr_dict + + def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Project]: + """ + Main function driver/workflow + Function that search, filters, downloads and save data and metadata from GEO and SRA + :param input: GSE or input file with gse's + :param name: Name of the project + :return: NoReturn or peppy Project + """ + + if name is not None: self.project_name = name else: - self.project_name = os.path.splitext(os.path.basename(input))[0] + try: + self.project_name = os.path.splitext(os.path.basename(input))[0] + except TypeError: + self.project_name = input + + # check to make sure prefetch is callable + if not self.just_metadata and not self.processed: + if not is_command_callable("prefetch"): + raise SystemExit( + "To download raw data You must first install the sratoolkit, with prefetch in your PATH." + " Installation instruction: http://geofetch.databio.org/en/latest/install/" + ) acc_GSE_list = parse_accessions( input, self.metadata_expanded, self.just_metadata ) - # Loop through each accession. - # This will process that accession, produce metadata and download files for - # the GSM #s included in the list for each GSE#. 
- # acc_GSE = "GSE61150" # example + metadata_dict_combined = {} + subannotation_dict_combined = {} - # This loop populates a list of metadata. - metadata_dict = {} - subannotation_dict = {} - failed_runs = [] processed_metadata_samples = [] - processed_metadata_exp = [] + processed_metadata_series = [] acc_GSE_keys = acc_GSE_list.keys() nkeys = len(acc_GSE_keys) ncount = 0 - for acc_GSE in acc_GSE_list.keys(): + for acc_GSE in track( + acc_GSE_list.keys(), + description="Processing... ", + disable=self.disable_progressbar, + ): + ncount += 1 if ncount <= self.skip: continue elif ncount == self.skip + 1: self._LOGGER.info(f"Skipped {self.skip} accessions. Starting now.") - self._LOGGER.info( - f"\033[38;5;200mProcessing accession {ncount} of {nkeys}: '{acc_GSE}'\033[0m" - ) + + if not self.just_object or not self.acc_anno: + self._LOGGER.info( + f"\033[38;5;200mProcessing accession {ncount} of {nkeys}: '{acc_GSE}'\033[0m" + ) if len(re.findall(GSE_PATTERN, acc_GSE)) != 1: self._LOGGER.debug(len(re.findall(GSE_PATTERN, acc_GSE))) @@ -263,557 +392,547 @@ def fetch_all(self, input, name=None): f"Limit to: {list(acc_GSE_list[acc_GSE])}" ) # a list of GSM#s - if self.refresh_metadata: - self._LOGGER.info("Refreshing metadata...") - # For each GSE acc, produce a series of metadata files file_gse = os.path.join(self.metadata_expanded, acc_GSE + "_GSE.soft") file_gsm = os.path.join(self.metadata_expanded, acc_GSE + "_GSM.soft") file_sra = os.path.join(self.metadata_expanded, acc_GSE + "_SRA.csv") - # Grab the GSE and GSM SOFT files from GEO. - # The GSE file has metadata describing the experiment, which includes - # The SRA number we need to download the raw data from SRA - # The GSM file has metadata describing each sample, which we will use to - # produce a sample annotation sheet. 
if not os.path.isfile(file_gse) or self.refresh_metadata: - Accession(acc_GSE).fetch_metadata(file_gse) + file_gse_content = Accession(acc_GSE).fetch_metadata( + file_gse, clean=self.discard_soft + ) else: self._LOGGER.info(f"Found previous GSE file: {file_gse}") + gse_file_obj = open(file_gse, "r") + file_gse_content = gse_file_obj.read().split("\n") + file_gse_content = [elem for elem in file_gse_content if len(elem) > 0] if not os.path.isfile(file_gsm) or self.refresh_metadata: - Accession(acc_GSE).fetch_metadata(file_gsm, typename="GSM") + file_gsm_content = Accession(acc_GSE).fetch_metadata( + file_gsm, typename="GSM", clean=self.discard_soft + ) else: self._LOGGER.info(f"Found previous GSM file: {file_gsm}") + gsm_file_obj = open(file_gsm, "r") + file_gsm_content = gsm_file_obj.read().split("\n") + file_gsm_content = [elem for elem in file_gsm_content if len(elem) > 0] - # if not os.path.isfile(file_gsm) or not os.path.isfile(file_gse): + gsm_enter_dict = acc_GSE_list[acc_GSE] # download processed data if self.processed: - try: - ( - meta_processed_samples, - meta_processed_series, - ) = self.get_list_of_processed_files(file_gse, file_gsm) - - # taking into account list of GSM that is specified in the input file - gsm_list = acc_GSE_list[acc_GSE] - meta_processed_samples = self.filter_gsm( - meta_processed_samples, gsm_list + ( + meta_processed_samples, + meta_processed_series, + ) = self.fetch_processed_one( + gse_file_content=file_gse_content, + gsm_file_content=file_gsm_content, + gsm_filter_list=gsm_enter_dict, + ) + + # download processed files: + if not self.just_metadata: + self._download_processed_data( + acc_gse=acc_GSE, + meta_processed_samples=meta_processed_samples, + meta_processed_series=meta_processed_series, ) - # Unify keys: - meta_processed_samples = self.unify_list_keys( - meta_processed_samples + + # generating PEPs for processed files: + if self.acc_anno: + self._generate_processed_meta( + acc_GSE, meta_processed_samples, 
meta_processed_series ) - meta_processed_series = self.unify_list_keys(meta_processed_series) - list_of_keys = self.get_list_of_keys(meta_processed_samples) - self._LOGGER.info("Expanding metadata list...") - for key_in_list in list_of_keys: - meta_processed_samples = self.expand_metadata_list( - meta_processed_samples, key_in_list - ) + else: + # adding metadata from current experiment to the project + processed_metadata_samples.extend(meta_processed_samples) + processed_metadata_series.extend(meta_processed_series) - list_of_keys_series = self.get_list_of_keys(meta_processed_series) - self._LOGGER.info("Expanding metadata list...") - for key_in_list in list_of_keys_series: - meta_processed_series = self.expand_metadata_list( - meta_processed_series, key_in_list - ) + else: + # read gsm metadata + gsm_metadata = self._read_gsm_metadata( + acc_GSE, acc_GSE_list, file_gsm_content + ) - # convert column names to lowercase and underscore - meta_processed_samples = self.standardize_colnames( - meta_processed_samples - ) - meta_processed_series = self.standardize_colnames( - meta_processed_series + # download sra metadata + srp_list_result = self._get_SRA_meta( + file_gse_content, gsm_metadata, file_sra + ) + if not srp_list_result: + self._LOGGER.info(f"No SRP data, continuing ....") + self._LOGGER.warning(f"No raw pep will be created! 
....") + # delete current acc if no raw data was found + # del metadata_dict[acc_GSE] + pass + else: + self._LOGGER.info("Parsing SRA file to download SRR records") + gsm_multi_table, gsm_metadata, runs = self._process_sra_meta( + srp_list_result, gsm_enter_dict, gsm_metadata + ) + + # download raw data: + if not self.just_metadata: + for run in runs: + # download raw data + self._LOGGER.info(f"Getting SRR: {run} in ({acc_GSE})") + self._download_raw_data(run) + else: + self._LOGGER.info(f"Dry run, no data will be downloaded") + + # save one project + if self.acc_anno and nkeys > 1: + self._write_raw_annotation_new( + name=acc_GSE, + metadata_dict=gsm_metadata, + subannot_dict=gsm_multi_table, ) - if not self.acc_anno: - # adding metadata from current experiment to the project - processed_metadata_samples.extend(meta_processed_samples) - processed_metadata_exp.extend(meta_processed_series) - - # save PEP for each accession if acc-anno flag is true - if self.acc_anno and len(acc_GSE_list.keys()) > 1: - if self.supp_by == "all": - # samples - pep_acc_path_sample = os.path.join( - self.metadata_raw, - f"{acc_GSE}_samples", - acc_GSE + SAMPLE_SUPP_METADATA_FILE, - ) - self.write_processed_annotation( - meta_processed_samples, pep_acc_path_sample - ) + else: + metadata_dict_combined.update(gsm_metadata) + subannotation_dict_combined.update(gsm_multi_table) - # series - pep_acc_path_exp = os.path.join( - self.metadata_raw, - f"{acc_GSE}_series", - acc_GSE + EXP_SUPP_METADATA_FILE, - ) - self.write_processed_annotation( - meta_processed_series, pep_acc_path_exp - ) - elif self.supp_by == "samples": - pep_acc_path_sample = os.path.join( - self.metadata_raw, - f"{acc_GSE}_samples", - acc_GSE + SAMPLE_SUPP_METADATA_FILE, - ) - self.write_processed_annotation( - meta_processed_samples, pep_acc_path_sample - ) - elif self.supp_by == "series": - pep_acc_path_exp = os.path.join( - self.metadata_raw, - f"{acc_GSE}_series", - acc_GSE + EXP_SUPP_METADATA_FILE, - ) - 
self.write_processed_annotation( - meta_processed_series, pep_acc_path_exp - ) + self._LOGGER.info(f"Finished processing {len(acc_GSE_list)} accession(s)") - if not self.just_metadata: - data_geo_folder = os.path.join(self.geo_folder, acc_GSE) - self._LOGGER.debug("Data folder: " + data_geo_folder) + # Logging cleaning process: + if self.discard_soft: + self._LOGGER.info(f"Cleaning soft files ...") + clean_soft_files(self.metadata_root_full) - if self.supp_by == "all": - processed_samples_files = [ - each_file["file_url"] - for each_file in meta_processed_samples - ] - for file_url in processed_samples_files: - self.download_processed_file(file_url, data_geo_folder) + ####################################################################################### - processed_series_files = [ - each_file["file_url"] - for each_file in meta_processed_series - ] - for file_url in processed_series_files: - self.download_processed_file(file_url, data_geo_folder) + # saving PEPs for processed data + if self.processed: + if not self.acc_anno: + return_value = self._generate_processed_meta( + name=self.project_name, + meta_processed_samples=processed_metadata_samples, + meta_processed_series=processed_metadata_series, + ) + if self.just_object: + return return_value - elif self.supp_by == "samples": - processed_samples_files = [ - each_file["file_url"] - for each_file in meta_processed_samples - ] - for file_url in processed_samples_files: - self.download_processed_file(file_url, data_geo_folder) + # saving PEPs for raw data + else: + return_value = self._write_raw_annotation_new( + "PEP", metadata_dict_combined, subannotation_dict_combined + ) + if self.just_object: + return return_value - elif self.supp_by == "series": - processed_series_files = [ - each_file["file_url"] - for each_file in meta_processed_series - ] - for file_url in processed_series_files: - self.download_processed_file(file_url, data_geo_folder) - except Exception as processed_exception: - 
failed_runs.append(acc_GSE) - self._LOGGER.warning(f"Error occurred: {processed_exception}") + def _process_sra_meta( + self, + srp_list_result: list = None, + gsm_enter_dict: dict = None, + gsm_metadata: dict = None, + ): + """ + Create srp multitable and update gsm_metadata based on srp + :param srp_list_result: list of srp got from sra file + :param gsm_enter_dict: gsm enter content + :param gsm_metadata: dict of samples of gsm + :return: srp multitable + """ + gsm_multi_table = {} + runs = [] + for line in srp_list_result: + + # Only download if it's in the include list: + experiment = line["Experiment"] + run_name = line["Run"] + if experiment not in gsm_metadata: + # print(f"Skipping: {experiment}") + continue - else: - # download gsm metadata - gsm_metadata = self.get_gsm_metadata(acc_GSE, acc_GSE_list, file_gsm) - metadata_dict[acc_GSE] = gsm_metadata + sample_name = None + try: + sample_name = gsm_enter_dict[gsm_metadata[experiment]["gsm_id"]] + except KeyError: + # No name in input file + pass + + if not sample_name or sample_name == "": + temp = gsm_metadata[experiment]["Sample_title"] + sample_name = _sanitize_name(temp) + + # Otherwise, record that there's SRA data for this run. + # And set a few columns that are used as input to the Looper + # print("Updating columns for looper") + _update_columns( + gsm_metadata, + experiment, + sample_name=sample_name, + read_type=line["LibraryLayout"], + ) - # download gsm metadata - SRP_list_result = self.get_SRA_meta(file_gse, gsm_metadata, file_sra) - if not SRP_list_result: - # delete current acc if no raw data was found - # del metadata_dict[acc_GSE] - continue - # Parse metadata from SRA - # Produce an annotated output from the GSM and SRARunInfo files. - # This will merge the GSM and SRA sample metadata into a dict of dicts, - # with one entry per sample. - # NB: There may be multiple SRA Runs (and thus lines in the RunInfo file) - # Corresponding to each sample. 
- # For multi samples (samples with multiple runs), we keep track of these - # relations in a separate table, which is called the subannotation table. - - gsm_multi_table = {} - self._LOGGER.info("Parsing SRA file to download SRR records") - - for line in SRP_list_result: - - # Only download if it's in the include list: - experiment = line["Experiment"] - run_name = line["Run"] - if experiment not in gsm_metadata: - # print(f"Skipping: {experiment}") - continue - - # local convenience variable - # possibly set in the input tsv file - sample_name = None # initialize to empty - try: - sample_name = acc_GSE_list[acc_GSE][ - gsm_metadata[experiment]["gsm_id"] + # Some experiments are flagged in SRA as having multiple runs. + if gsm_metadata[experiment].get("SRR") is not None: + # This SRX number already has an entry in the table. + self._LOGGER.debug(f"Found additional run: {run_name} ({experiment})") + if ( + isinstance(gsm_metadata[experiment]["SRR"], str) + and experiment not in gsm_multi_table + ): + gsm_multi_table[experiment] = [] + + gsm_multi_table[experiment].append( + [ + sample_name, + experiment, + gsm_metadata[experiment]["SRR"], ] - except KeyError: - self._LOGGER.info( - f"sample_name does not exist, creating new..." - ) - if not sample_name or sample_name == "": - temp = gsm_metadata[experiment]["Sample_title"] - sample_name = self.sanitize_name(temp) - - # Otherwise, record that there's SRA data for this run. - # And set a few columns that are used as input to the Looper - # print("Updating columns for looper") - self.update_columns( - gsm_metadata, - experiment, - sample_name=sample_name, - read_type=line["LibraryLayout"], + ) + gsm_multi_table[experiment].append( + [sample_name, experiment, run_name] + ) + else: + gsm_multi_table[experiment].append( + [sample_name, experiment, run_name] ) - # Some experiments are flagged in SRA as having multiple runs. 
- if gsm_metadata[experiment].get("SRR") is not None: - # This SRX number already has an entry in the table. - self._LOGGER.info( - f"Found additional run: {run_name} ({experiment})" - ) + if self.split_experiments: + rep_number = len(gsm_multi_table[experiment]) + new_SRX = experiment + "_" + str(rep_number) + gsm_metadata[new_SRX] = copy.copy(gsm_metadata[experiment]) + # gsm_metadata[new_SRX]["SRX"] = new_SRX + gsm_metadata[new_SRX]["sample_name"] += "_" + str(rep_number) + gsm_metadata[new_SRX]["SRR"] = run_name + else: + # Either way, set the srr code to multi in the main table. + gsm_metadata[experiment]["SRR"] = "multi" + else: + # The first SRR for this SRX is added to GSM metadata + gsm_metadata[experiment]["SRR"] = run_name + runs.append(run_name) - if ( - isinstance(gsm_metadata[experiment]["SRR"], _STRING_TYPES) - and experiment not in gsm_multi_table - ): - # Only one has been stuck in so far, make a list - gsm_multi_table[experiment] = [] - # Add first the original one, which was stored as a string - # previously - gsm_multi_table[experiment].append( - [ - sample_name, - experiment, - gsm_metadata[experiment]["SRR"], - ] - ) - # Now append the current SRR number in a list as [SRX, SRR] - gsm_multi_table[experiment].append( - [sample_name, experiment, run_name] - ) - else: - # this is the 3rd or later sample; the first two are done, - # so just add it. - gsm_multi_table[experiment].append( - [sample_name, experiment, run_name] - ) + return gsm_multi_table, gsm_metadata, runs - if self.split_experiments: - # Duplicate the gsm metadata for this experiment (copy to make sure - # it's not just an alias). 
- rep_number = len(gsm_multi_table[experiment]) - new_SRX = experiment + "_" + str(rep_number) - gsm_metadata[new_SRX] = copy.copy(gsm_metadata[experiment]) - # gsm_metadata[new_SRX]["SRX"] = new_SRX - gsm_metadata[new_SRX]["sample_name"] += "_" + str( - rep_number - ) - gsm_metadata[new_SRX]["SRR"] = run_name - else: - # Either way, set the srr code to multi in the main table. - gsm_metadata[experiment]["SRR"] = "multi" - else: - # The first SRR for this SRX is added to GSM metadata - gsm_metadata[experiment]["SRR"] = run_name - - self._LOGGER.info(f"Getting SRR: {run_name} ({experiment})") - bam_file = ( - "" - if self.bam_folder == "" - else os.path.join(self.bam_folder, run_name + ".bam") - ) - fq_file = ( - "" - if self.fq_folder == "" - else os.path.join(self.fq_folder, run_name + "_1.fq") - ) + def _download_raw_data(self, run_name: str) -> NoReturn: + """ + Downloade raw data from SRA by providing run name + :param run_name: Run name from SRA + :return: NoReturn + """ + bam_file = ( + "" + if self.bam_folder == "" + else os.path.join(self.bam_folder, run_name + ".bam") + ) + fq_file = ( + "" + if self.fq_folder == "" + else os.path.join(self.fq_folder, run_name + "_1.fq") + ) + if os.path.exists(bam_file): + self._LOGGER.info(f"BAM found: {bam_file} . Skipping...") + elif os.path.exists(fq_file): + self._LOGGER.info(f"FQ found: {fq_file} .Skipping...") + else: + try: + self._download_SRA_file(run_name) + except Exception as err: + self._LOGGER.warning( + f"Error occurred while downloading SRA file: {err}" + ) + + if self.bam_conversion and self.bam_folder != "": + try: + # converting sra to bam using # TODO: sam-dump has a built-in prefetch. I don't have to do # any of this stuff... This also solves the bad sam-dump issues. + self._sra_to_bam_conversion_sam_dump(bam_file, run_name) - if os.path.exists(bam_file): - self._LOGGER.info(f"BAM found: {bam_file} . 
Skipping...") - elif os.path.exists(fq_file): - self._LOGGER.info(f"FQ found: {fq_file} .Skipping...") - else: - if not self.just_metadata: - try: - self.download_SRA_file(run_name) - except Exception as err: - failed_runs.append(run_name) - self._LOGGER.warning( - f"Error occurred while downloading SRA file: {err}" - ) - else: - self._LOGGER.info("Dry run (no raw data will be download)") + # checking if bam_file converted correctly, if not --> use fastq-dump + st = os.stat(bam_file) + if st.st_size < 100: + self._LOGGER.warning( + "Bam conversion failed with sam-dump. Trying fastq-dump..." + ) + self._sra_to_bam_conversion_fastq_damp( + bam_file, run_name, self.picard_path + ) - if self.bam_conversion and self.bam_folder != "": - try: - # converting sra to bam using - self.sra_bam_conversion(bam_file, run_name) + except FileNotFoundError as err: + self._LOGGER.info( + f"SRA file doesn't exist, please download it first: {err}" + ) - # checking if bam_file converted correctly, if not --> use fastq-dump - st = os.stat(bam_file) - if st.st_size < 100: - self._LOGGER.warning( - "Bam conversion failed with sam-dump. Trying fastq-dump..." 
- ) - self.sra_bam_conversion2( - bam_file, run_name, self.picard_path - ) + def fetch_processed_one( + self, + gse_file_content: list, + gsm_file_content: list, + gsm_filter_list: dict, + ) -> Tuple: + """ + Fetche one processed GSE project and return its metadata + :param gsm_file_content: gse soft file content + :param gse_file_content: gsm soft file content + :param gsm_filter_list: list of gsm that have to be downloaded + :return: Tuple of project list of gsm samples and gse samples + """ + ( + meta_processed_samples, + meta_processed_series, + ) = self._get_list_of_processed_files(gse_file_content, gsm_file_content) - except FileNotFoundError as err: - self._LOGGER.info( - f"SRA file doesn't exist, please download it first: {err}" - ) + # taking into account list of GSM that is specified in the input file + meta_processed_samples = _filter_gsm(meta_processed_samples, gsm_filter_list) - # accumulate subannotations - subannotation_dict[acc_GSE] = gsm_multi_table + # samples + meta_processed_samples = self._expand_metadata_list(meta_processed_samples) - # Logging additional information about processing - self._LOGGER.info(f"Finished processing {len(acc_GSE_list)} accession(s)") + # series + meta_processed_series = self._expand_metadata_list(meta_processed_series) - # Logging cleaning process: - if self.discard_soft: - self._LOGGER.info(f"Cleaning soft files ...") - clean_soft_files(self.metadata_raw) + # convert column names to lowercase and underscore + meta_processed_samples = _standardize_colnames(meta_processed_samples) + meta_processed_series = _standardize_colnames(meta_processed_series) + + return meta_processed_samples, meta_processed_series - if len(failed_runs) > 0: - self._LOGGER.warn( - f"The following samples could not be downloaded: {failed_runs}" + def _generate_processed_meta( + self, name: str, meta_processed_samples: list, meta_processed_series: list + ) -> dict: + """ + Generate and save PEPs for processed accessions. 
GEO has data in GSE and GSM, + conditions are used to decide which PEPs will be saved. + :param name: name of the folder/file where PEP will be saved + :param meta_processed_samples: + :param meta_processed_series: + :return: dict of objects if just_object is set, otherwise dicts of None + """ + return_objects = {f"{name}_samples": None, f"{name}_series": None} + + if self.supp_by == "all": + # samples + pep_acc_path_sample = os.path.join( + self.metadata_root_full, + f"{name}_samples", + name + SAMPLE_SUPP_METADATA_FILE, + ) + return_objects[f"{name}_samples"] = self._write_processed_annotation( + meta_processed_samples, + pep_acc_path_sample, + just_object=self.just_object, ) - ####################################################################################### + # series + pep_acc_path_exp = os.path.join( + self.metadata_root_full, + f"{name}_series", + name + EXP_SUPP_METADATA_FILE, + ) + return_objects[f"{name}_series"] = self._write_processed_annotation( + meta_processed_series, + pep_acc_path_exp, + just_object=self.just_object, + ) - # saving PEPs for processed data - if self.processed: - if not self.acc_anno: - if self.supp_by == "all": - supp_sample_path_meta = os.path.join( - self.metadata_raw, - "PEP_samples", - self.project_name + SAMPLE_SUPP_METADATA_FILE, - ) - self.write_processed_annotation( - processed_metadata_samples, supp_sample_path_meta - ) + elif self.supp_by == "samples": + pep_acc_path_sample = os.path.join( + self.metadata_root_full, + f"{name}_samples", + name + SAMPLE_SUPP_METADATA_FILE, + ) + return_objects[f"{name}_samples"] = self._write_processed_annotation( + meta_processed_samples, + pep_acc_path_sample, + just_object=self.just_object, + ) + elif self.supp_by == "series": + return_objects[f"{name}_series"] = pep_acc_path_exp = os.path.join( + self.metadata_root_full, + f"{name}_series", + name + EXP_SUPP_METADATA_FILE, + ) + self._write_processed_annotation( + meta_processed_series, + pep_acc_path_exp, + 
just_object=self.just_object, + ) - supp_series_path_meta = os.path.join( - self.metadata_raw, - "PEP_series", - self.project_name + EXP_SUPP_METADATA_FILE, - ) - self.write_processed_annotation( - processed_metadata_exp, supp_series_path_meta - ) + return return_objects - elif self.supp_by == "samples": - supp_sample_path_meta = os.path.join( - self.metadata_raw, - "PEP_samples", - self.project_name + SAMPLE_SUPP_METADATA_FILE, - ) - self.write_processed_annotation( - processed_metadata_samples, supp_sample_path_meta - ) + def _download_processed_data( + self, acc_gse: str, meta_processed_samples: list, meta_processed_series: list + ) -> NoReturn: + """ + Download processed data from GEO by providing project annotation list + :param acc_gse: accession number of the project + :param meta_processed_samples: list of annotation of samples + :param meta_processed_series: list of annotation of series + :return: Noreturn + """ + data_geo_folder = os.path.join(self.geo_folder, acc_gse) + self._LOGGER.debug("Data folder: " + data_geo_folder) + + if self.supp_by == "all": + processed_samples_files = [ + each_file["file_url"] for each_file in meta_processed_samples + ] + for file_url in processed_samples_files: + self._download_processed_file(file_url, data_geo_folder) + + processed_series_files = [ + each_file["file_url"] for each_file in meta_processed_series + ] + for file_url in processed_series_files: + self._download_processed_file(file_url, data_geo_folder) + + elif self.supp_by == "samples": + processed_samples_files = [ + each_file["file_url"] for each_file in meta_processed_samples + ] + for file_url in processed_samples_files: + self._download_processed_file(file_url, data_geo_folder) + + elif self.supp_by == "series": + processed_series_files = [ + each_file["file_url"] for each_file in meta_processed_series + ] + for file_url in processed_series_files: + self._download_processed_file(file_url, data_geo_folder) + + def _expand_metadata_dict(self, metadata_dict: 
dict) -> dict: + """ + Expand all lists of all items in the dict by creating new items or joining them - elif self.supp_by == "series": - supp_series_path_meta = os.path.join( - self.metadata_raw, - "PEP_series", - self.project_name + EXP_SUPP_METADATA_FILE, - ) - self.write_processed_annotation( - processed_metadata_exp, supp_series_path_meta - ) + :param metadata_dict: metadata dict + :return: expanded metadata dict + """ + prj_list = _dict_to_list_converter(proj_dict=metadata_dict) + prj_list = self._expand_metadata_list(prj_list) + return _dict_to_list_converter(proj_list=prj_list) - # saving PEPs for raw data - else: - self.write_raw_annotation(metadata_dict, subannotation_dict) + def _expand_metadata_list(self, metadata_list: list) -> list: + """ + Expanding all lists of all items in the list by creating new items or joining them + + :param list metadata_list: list of dicts that store metadata + :return list: expanded metadata list + """ + self._LOGGER.info("Expanding metadata list...") + list_of_keys = _get_list_of_keys(metadata_list) + for key_in_list in list_of_keys: + metadata_list = self._expand_metadata_list_item(metadata_list, key_in_list) + return metadata_list - def expand_metadata_list(self, metadata_list, dict_key): + def _expand_metadata_list_item(self, metadata_list: list, dict_key: str): """ - Expanding list items in the list by creating new items or joining them + Expand list of one element (item) in the list by creating new items or joining them + ["first1: fff", ...] 
-> separate columns :param list metadata_list: list of dicts that store metadata :param str dict_key: key in the dictionaries that have to be expanded - - :return str: path to file written + :return list: expanded metadata list """ try: element_is_list = any( - type(list_item[dict_key]) is list for list_item in metadata_list + type(list_item.get(dict_key)) is list for list_item in metadata_list ) if element_is_list: for n_elem in range(len(metadata_list)): - if type(metadata_list[n_elem][dict_key]) is not list: - metadata_list[n_elem][dict_key] = [ - metadata_list[n_elem][dict_key] - ] - - just_string = False - this_string = "" - for elem in metadata_list[n_elem][dict_key]: - separated_elements = elem.split(": ") - if len(separated_elements) >= 2: + try: + if type(metadata_list[n_elem][dict_key]) is not list: + metadata_list[n_elem][dict_key] = [ + metadata_list[n_elem][dict_key] + ] - # if first element is larger than 40 then treat it like simple string - if len(separated_elements[0]) > 40: - just_string = True - if this_string != "": - this_string = ", ".join([this_string, elem]) + just_string = False + this_string = "" + for elem in metadata_list[n_elem][dict_key]: + separated_elements = elem.split(": ") + if len(separated_elements) >= 2: + + # if first element is larger than 40 then treat it like simple string + if len(separated_elements[0]) > 40: + just_string = True + if this_string != "": + this_string = ", ".join([this_string, elem]) + else: + this_string = elem + # additional elem for all bed files + elif len(separated_elements[0].split("(")) > 1: + just_string = True + if this_string != "": + this_string = "(".join([this_string, elem]) + else: + this_string = elem else: - this_string = elem - # additional elem for all bed files - elif len(separated_elements[0].split("(")) > 1: + list_of_elem = [ + separated_elements[0], + ": ".join(separated_elements[1:]), + ] + sample_char = dict([list_of_elem]) + metadata_list[n_elem].update(sample_char) + else: 
just_string = True if this_string != "": - this_string = "(".join([this_string, elem]) + this_string = ", ".join([this_string, elem]) else: this_string = elem - else: - list_of_elem = [ - separated_elements[0], - ": ".join(separated_elements[1:]), - ] - sample_char = dict([list_of_elem]) - metadata_list[n_elem].update(sample_char) - else: - just_string = True - if this_string != "": - this_string = ", ".join([this_string, elem]) - else: - this_string = elem - if just_string: - metadata_list[n_elem][dict_key] = this_string - else: - del metadata_list[n_elem][dict_key] + if just_string: + metadata_list[n_elem][dict_key] = this_string + else: + del metadata_list[n_elem][dict_key] + except KeyError as err: + self._LOGGER.warning( + f"expand_metadata_list: Key Error: {err}, continuing ..." + ) return metadata_list else: self._LOGGER.debug( - "metadata with %s was not expanded, as item is not list" % dict_key + f"Metadata with {dict_key} was not expanded, as item is not list" ) return metadata_list except KeyError as err: - self._LOGGER.warning("Key Error: %s" % err) + self._LOGGER.warning(f"expand_metadata_list: Key Error: {err}") return metadata_list - except ValueError as err1: - self._LOGGER.warning("Value Error: %s" % err1) + except ValueError as err: + self._LOGGER.warning("expand_metadata_list: Value Error: {err}") return metadata_list - def filter_gsm(self, meta_processed_samples: list, gsm_list: dict) -> list: - """ - Getting metadata list of all samples of one experiment and filtering it - by the list of GSM that was specified in the input files. - And then changing names of the sample names. - - :param meta_processed_samples: list of metadata dicts of samples - :param gsm_list: list of dicts where GSM (samples) are keys and - sample names are values. 
Where values can be empty string - """ - - if gsm_list.keys(): - new_gsm_list = [] - for gsm_sample in meta_processed_samples: - if gsm_sample["Sample_geo_accession"] in gsm_list.keys(): - gsm_sample_new = gsm_sample - if gsm_list[gsm_sample["Sample_geo_accession"]] != "": - gsm_sample_new["sample_name"] = gsm_list[ - gsm_sample["Sample_geo_accession"] - ] - new_gsm_list.append(gsm_sample_new) - return new_gsm_list - return meta_processed_samples - - @staticmethod - def get_list_of_keys(list_of_dict): - """ - Getting list of all keys that are in the dictionaries in the list - - :param list list_of_dict: list of dicts with metadata - - :return list: list of dictionary keys - """ - - list_of_keys = [] - for element in list_of_dict: - list_of_keys.extend(list(element.keys())) - return list(set(list_of_keys)) - - def unify_list_keys(self, processed_meta_list): - """ - Unifying list of dicts with metadata, so every dict will have - same keys - - :param list processed_meta_list: list of dicts with metadata - - :return str: list of unified dicts with metadata - """ - list_of_keys = self.get_list_of_keys(processed_meta_list) - for k in list_of_keys: - for list_elem in range(len(processed_meta_list)): - if k not in processed_meta_list[list_elem]: - processed_meta_list[list_elem][k] = "" - return processed_meta_list - - def find_genome(self, metadata_list): - """ - Create new genome table by joining few columns - """ - list_keys = self.get_list_of_keys(metadata_list) - genome_keys = [ - "assembly", - "genome_build", - ] - proj_gen_keys = list(set(list_keys).intersection(genome_keys)) - - for sample in enumerate(metadata_list): - sample_genome = "" - for key in proj_gen_keys: - sample_genome = " ".join([sample_genome, sample[1][key]]) - metadata_list[sample[0]]["sample_genome"] = sample_genome - return metadata_list - - def write_gsm_annotation(self, gsm_metadata, file_annotation, use_key_subset=False): + def _write_gsm_annotation(self, gsm_metadata: dict, file_annotation: 
str) -> str: """ Write metadata sheet out as an annotation file. :param Mapping gsm_metadata: the data to write, parsed from a file with metadata/annotation information :param str file_annotation: the path to the file to write - :param bool use_key_subset: whether to use the keys present in the - metadata object given (False), or instead use a fixed set of keys - defined within this module (True) - :return str: path to file written + :return str: path to the file """ - if use_key_subset: - keys = ANNOTATION_SHEET_KEYS - else: - # keys = gsm_metadata[gsm_metadata.keys().next()].keys() - keys = list(list(gsm_metadata.values())[0].keys()) + keys = list(list(gsm_metadata.values())[0].keys()) - self._LOGGER.info(f"Sample annotation sheet: {file_annotation}") + self._LOGGER.info( + f"\033[92mSample annotation sheet: {file_annotation} . Saved!\033[0m" + ) fp = expandpath(file_annotation) - self._LOGGER.info(f"Writing: {fp}") with open(fp, "w") as of: w = csv.DictWriter(of, keys, extrasaction="ignore") w.writeheader() for item in gsm_metadata: w.writerow(gsm_metadata[item]) + self._LOGGER.info( + f"\033[92mSample annotation sheet: {file_annotation} . 
Saved!\033[0m" + ) + self._LOGGER.info("\033[92mFile has been saved successfully\033[0m") return fp - def write_processed_annotation(self, processed_metadata, file_annotation_path): + def _write_processed_annotation( + self, + processed_metadata: list, + file_annotation_path: str, + just_object: bool = False, + ) -> Union[NoReturn, peppy.Project]: """ - Saving annotation file by providing list of dictionaries with files metadata + Save annotation file by providing list of dictionaries with files metadata :param list processed_metadata: list of dictionaries with files metadata :param str file_annotation_path: the path to the metadata file that has to be saved + :type just_object: True, if you want to get peppy object without saving file + :return: none, or peppy project """ if len(processed_metadata) == 0: self._LOGGER.info( @@ -828,211 +947,270 @@ def write_processed_annotation(self, processed_metadata, file_annotation_path): os.makedirs(pep_file_folder) self._LOGGER.info("Unifying and saving of metadata... 
") - processed_metadata = self.unify_list_keys(processed_metadata) + processed_metadata = _unify_list_keys(processed_metadata) # delete rare keys - processed_metadata = self.find_genome(processed_metadata) + processed_metadata = self._find_genome(processed_metadata) # filtering huge annotation strings that are repeating for each sample - processed_metadata, proj_meta = self.separate_common_meta( + processed_metadata, proj_meta = self._separate_common_meta( processed_metadata, self.const_limit_project, self.const_limit_discard, self.attr_limit_truncate, ) - meta_list_str = [ - f"{list(i.keys())[0]}: {list(i.values())[0]}" for i in proj_meta - ] - modifiers_str = "\n ".join(d for d in meta_list_str) - with open(file_annotation_path, "w") as m_file: - dict_writer = csv.DictWriter(m_file, processed_metadata[0].keys()) - dict_writer.writeheader() - dict_writer.writerows(processed_metadata) - self._LOGGER.info( - "\033[92mFile %s has been saved successfully\033[0m" % file_annotation_path - ) + template = self._create_config_processed(file_annotation_path, proj_meta) - geofetchdir = os.path.dirname(__file__) - config_template = os.path.join(geofetchdir, "config_processed_template.yaml") + if not just_object: + with open(file_annotation_path, "w") as m_file: + dict_writer = csv.DictWriter(m_file, processed_metadata[0].keys()) + dict_writer.writeheader() + dict_writer.writerows(processed_metadata) + self._LOGGER.info( + "\033[92mFile %s has been saved successfully\033[0m" + % file_annotation_path + ) - with open(config_template, "r") as template_file: - template = template_file.read() + # save .yaml file + yaml_name = os.path.split(file_annotation_path)[1][:-4] + ".yaml" + config = os.path.join(pep_file_folder, yaml_name) + self._write(config, template, msg_pre=" Config file: ") - template_values = { - "project_name": self.project_name, - "sample_table": os.path.basename(file_annotation_path), - "geo_folder": self.geo_folder, - "pipeline_samples": 
self.file_pipeline_samples, - "pipeline_project": self.file_pipeline_project, - "additional_columns": modifiers_str, - } + # save .pep.yaml file + if self.add_dotfile: + dot_yaml_path = os.path.join(pep_file_folder, ".pep.yaml") + _create_dot_yaml(dot_yaml_path, yaml_name) - for k, v in template_values.items(): - placeholder = "{" + str(k) + "}" - template = template.replace(placeholder, str(v)) + return None - # save .yaml file - yaml_name = os.path.split(file_annotation_path)[1][:-4] + ".yaml" - config = os.path.join(pep_file_folder, yaml_name) - self._write(config, template, msg_pre=" Config file: ") - - # save .pep.yaml file - if self.add_dotfile: - dot_yaml_path = os.path.join(pep_file_folder, ".pep.yaml") - self.create_dot_yaml(dot_yaml_path, yaml_name) + else: + pd_value = pd.DataFrame(processed_metadata) - return True + conf = yaml.load(template, Loader=yaml.Loader) + proj = peppy.Project().from_pandas(pd_value, config=conf) + return proj @staticmethod - def sanitize_name(name_str: str): + def _find_genome(metadata_list: list) -> list: """ - Function that sanitizing strings. (Replace all odd characters) - :param str name_str: Any string value that has to be sanitized. 
- :return: sanitized strings + Create new genome column by searching joining few columns + :param metadata_list: list with metadata dict + :return: list with metadata dict where genome column was added """ - new_str = name_str - for odd_char in list(punctuation): - new_str = new_str.replace(odd_char, "_") - new_str = new_str.replace(" ", "_").replace("__", "_") - return new_str + list_keys = _get_list_of_keys(metadata_list) + genome_keys = [ + "assembly", + "genome_build", + ] + proj_gen_keys = list(set(list_keys).intersection(genome_keys)) - def write_raw_annotation(self, metadata_dict, subannotation_dict): + for sample in enumerate(metadata_list): + sample_genome = "" + for key in proj_gen_keys: + sample_genome = " ".join([sample_genome, sample[1][key]]) + metadata_list[sample[0]][NEW_GENOME_COL_NAME] = sample_genome + return metadata_list + + def _write_raw_annotation_new( + self, name, metadata_dict: dict, subannot_dict: dict = None + ) -> Union[None, peppy.Project]: """ - Combining individual accessions into project-level annotations, and writeing + Combine individual accessions into project-level annotations, and writing individual accession files (if requested) - :param dict metadata_dict: dictionary of metadata - :param dict subannotation_dict: dictionary of sub-annotation metadata + :param name: Name of the run, project, or acc --> will influence name of the folder where project will be created + :param metadata_dict: dictionary of sample annotations + :param subannot_dict: dictionary of subsample annotations + :return: none or peppy object """ - - if self.discard_soft: - clean_soft_files(os.path.join(self.metadata_raw)) - try: assert len(metadata_dict) > 0 except AssertionError: self._LOGGER.warning( "\033[33mNo PEP created, as no raw data was found!!!\033[0m" ) - return False + return None - # checking sample_name value if it's not empty, - # otherwise pulling from title - for key, value in metadata_dict.items(): - fixed_dict = {} - for key_sample, 
value_sample in value.items(): - fixed_dict[key_sample] = value_sample - if ( - value_sample["sample_name"] == "" - or value_sample["sample_name"] is None - ): - fixed_dict[key_sample]["sample_name"] = value_sample["Sample_title"] + if self.discard_soft: + clean_soft_files(os.path.join(self.metadata_root_full)) - # sanitize sample names - fixed_dict[key_sample]["sample_name"] = self.sanitize_name( - fixed_dict[key_sample]["sample_name"] - ) + self._LOGGER.info( + "Creating complete project annotation sheets and config file..." + ) - metadata_dict[key] = fixed_dict + proj_root = os.path.join(self.metadata_root_full, name) + if not os.path.exists(proj_root): + os.makedirs(proj_root) - metadata_dict_combined = {} - for acc_GSE, gsm_metadata in metadata_dict.items(): - file_annotation = os.path.join( - self.metadata_expanded, acc_GSE + "_annotation.csv" - ) - if self.acc_anno: - self.write_gsm_annotation( - gsm_metadata, - file_annotation, - use_key_subset=self.use_key_subset, - ) - metadata_dict_combined.update(gsm_metadata) + proj_root_sample = os.path.join( + proj_root, f"{name}{FILE_RAW_NAME_SAMPLE_PATTERN}" + ) + proj_root_subsample = os.path.join( + proj_root, f"{name}{FILE_RAW_NAME_SUBSAMPLE_PATTERN}" + ) + yaml_name = f"{name}.yaml" + proj_root_yaml = os.path.join(proj_root, yaml_name) + dot_yaml_path = os.path.join(proj_root, ".pep.yaml") - # filtering huge annotation strings that are repeating for each sample - metadata_dict_combined, proj_meta = self.separate_common_meta( - metadata_dict_combined, + metadata_dict = self._check_sample_name_standard(metadata_dict) + + metadata_dict, proj_meta = self._separate_common_meta( + metadata_dict, self.const_limit_project, self.const_limit_discard, self.attr_limit_truncate, ) - meta_list_str = [ - f"{list(i.keys())[0]}: {list(i.values())[0]}" for i in proj_meta - ] - modifiers_str = "\n ".join(d for d in meta_list_str) - subannotation_dict_combined = {} - for acc_GSE, gsm_multi_table in subannotation_dict.items(): - 
file_subannotation = os.path.join( - self.metadata_expanded, acc_GSE + "_subannotation.csv" - ) - if self.acc_anno: - self.write_subannotation(gsm_multi_table, file_subannotation) - subannotation_dict_combined.update(gsm_multi_table) - self._LOGGER.info( - "Creating complete project annotation sheets and config file..." - ) - # If the project included more than one GSE, we can now output combined - # annotation tables for the entire project. - # Write combined annotation sheet - file_annotation = os.path.join( - self.metadata_raw, self.project_name + "_annotation.csv" - ) - self.write_gsm_annotation( - metadata_dict_combined, - file_annotation, - use_key_subset=self.use_key_subset, - ) # Write combined subannotation table - if len(subannotation_dict_combined) > 0: - file_subannotation = os.path.join( - self.metadata_raw, self.project_name + "_subannotation.csv" + if len(subannot_dict) > 0: + subanot_path_yaml = ( + f"subsample_table: {os.path.basename(proj_root_subsample)}" ) - self.write_subannotation(subannotation_dict_combined, file_subannotation) else: - file_subannotation = "null" + subanot_path_yaml = f"" + + template = self._create_config_raw( + proj_meta, proj_root_sample, subanot_path_yaml + ) + + if not self.just_object: + self._write_gsm_annotation(metadata_dict, proj_root_sample) + + if len(subannot_dict) > 0: + self._write_subannotation(subannot_dict, proj_root_subsample) + + self._write(proj_root_yaml, template, msg_pre=" Config file: ") + + if self.add_dotfile: + _create_dot_yaml(dot_yaml_path, yaml_name) + + else: + meta_df = pd.DataFrame.from_dict(metadata_dict, orient="index") + + # open list: + new_sub_list = [] + for sub_key in subannot_dict.keys(): + new_sub_list.extend([col_item for col_item in subannot_dict[sub_key]]) + + sub_meta_df = pd.DataFrame( + new_sub_list, columns=["sample_name", "SRX", "SRR"] + ) + + if sub_meta_df.empty: + sub_meta_df = None + else: + sub_meta_df = [sub_meta_df] + conf = yaml.load(template, Loader=yaml.Loader) + + 
proj = peppy.Project().from_pandas(meta_df, sub_meta_df, conf) + return proj + + def _create_config_processed( + self, file_annotation_path: str, proj_meta: list + ) -> str: + """ + Compose and generate config file content + :param file_annotation_path: root to the annotation file + :param proj_meta: common metadata that has to added to config file + :return: generated, complete config file content + """ + geofetchdir = os.path.dirname(__file__) + config_template = os.path.join(geofetchdir, CONFIG_PROCESSED_TEMPLATE_NAME) + with open(config_template, "r") as template_file: + template = template_file.read() + meta_list_str = [ + f'{list(i.keys())[0]}: "{_sanitize_config_string(list(i.values())[0])}"' + for i in proj_meta + ] + modifiers_str = "\n ".join(d for d in meta_list_str) + template_values = { + "project_name": self.project_name, + "sample_table": os.path.basename(file_annotation_path), + "geo_folder": self.geo_folder, + "pipeline_samples": self.file_pipeline_samples, + "pipeline_project": self.file_pipeline_project, + "additional_columns": modifiers_str, + } + for k, v in template_values.items(): + placeholder = "{" + str(k) + "}" + template = template.replace(placeholder, str(v)) + return template + + def _create_config_raw(self, proj_meta, proj_root_sample, subanot_path_yaml): + """ + Compose and generate config file content for raw data + :param proj_meta: root to the annotation file + :param proj_root_sample: path to sampletable file + :param subanot_path_yaml: path to subannotation file + :return: generated, complete config file content + """ + meta_list_str = [ + f'{list(i.keys())[0]}: "{_sanitize_config_string(list(i.values())[0])}"' + for i in proj_meta + ] + modifiers_str = "\n ".join(d for d in meta_list_str) # Write project config file + geofetchdir = os.path.dirname(__file__) + + if self.file_pipeline_samples or modifiers_str != "": + sample_modifier_str = "sample_modifiers:\n append:" + else: + sample_modifier_str = "" if not 
self.config_template: - geofetchdir = os.path.dirname(__file__) - self.config_template = os.path.join(geofetchdir, "config_template.yaml") + self.config_template = os.path.join(geofetchdir, CONFIG_RAW_TEMPLATE_NAME) + if self.add_convert_modifier: + sra_convert_path = os.path.join(geofetchdir, CONFIG_SRA_TEMPLATE) + with open(sra_convert_path, "r") as template_file: + sra_convert_template = template_file.read() + else: + sra_convert_template = "" with open(self.config_template, "r") as template_file: template = template_file.read() - template_values = { "project_name": self.project_name, - "annotation": os.path.basename(file_annotation), - "subannotation": os.path.basename(file_subannotation), + "annotation": os.path.basename(proj_root_sample), + "subannotation": subanot_path_yaml, + "sample_modifier_str": sample_modifier_str, "pipeline_samples": self.file_pipeline_samples, "pipeline_project": self.file_pipeline_project, "additional_columns": modifiers_str, + "sra_convert": sra_convert_template, } for k, v in template_values.items(): placeholder = "{" + str(k) + "}" template = template.replace(placeholder, str(v)) - # save .yaml file - yaml_name = self.project_name + "_config.yaml" - config = os.path.join(self.metadata_raw, yaml_name) - self._write(config, template, msg_pre=" Config file: ") - - # save .pep.yaml file - if self.add_dotfile: - dot_yaml_path = os.path.join(self.metadata_raw, ".pep.yaml") - self.create_dot_yaml(dot_yaml_path, yaml_name) + return template @staticmethod - def create_dot_yaml(file_path: str, yaml_path: str): + def _check_sample_name_standard(metadata_dict: dict) -> dict: """ - Function that creates .pep.yaml file that points to actual yaml file - :param str file_path: Path to the .pep.yaml file that we want to create - :param str yaml_path: path or name of the actual yaml file + Standardize sample name and checking if it exists + (This function is used for raw data) + :param metadata_dict: metadata dict + :return: metadata dict with 
standardize sample names """ - with open(file_path, "w+") as file: - file.writelines(f"config_file: {yaml_path}") + fixed_dict = {} + for key_sample, value_sample in metadata_dict.items(): + fixed_dict[key_sample] = value_sample + if value_sample["sample_name"] == "" or value_sample["sample_name"] is None: + fixed_dict[key_sample]["sample_name"] = value_sample["Sample_title"] + # sanitize names + fixed_dict[key_sample]["sample_name"] = _sanitize_name( + fixed_dict[key_sample]["sample_name"] + ) + metadata_dict = fixed_dict + metadata_dict = _standardize_colnames(metadata_dict) + return metadata_dict - def separate_common_meta( - self, meta_list, max_len=50, del_limit=250, attr_limit_truncate=500 - ): + @staticmethod + def _separate_common_meta( + meta_list: Union[List, Dict], + max_len: int = 50, + del_limit: int = 250, + attr_limit_truncate: int = 500, + ) -> tuple: """ - This function is separating information for the experiment from a sample + Separate experiment(project) metadata from sample metadata :param list or dict meta_list: list of dictionaries of samples :param int max_len: threshold of the length of the common value that can be stored in the sample table :param int del_limit: threshold of the length of the common value that have to be deleted @@ -1041,18 +1219,13 @@ def separate_common_meta( list of samples metadata dictionaries and 2: list of common samples metadata dictionaries that are linked to the project. 
""" + # check if meta_list is dict and converting it to list input_is_dict = False if isinstance(meta_list, dict): input_is_dict = True - new_meta_list = [] - for key in meta_list: - new_dict = meta_list[key] - new_dict["big_key"] = key - new_meta_list.append(new_dict) - - meta_list = new_meta_list + meta_list = _dict_to_list_converter(proj_dict=meta_list) - list_of_keys = self.get_list_of_keys(meta_list) + list_of_keys = _get_list_of_keys(meta_list) list_keys_diff = [] # finding columns with common values for this_key in list_of_keys: @@ -1082,9 +1255,11 @@ def separate_common_meta( if this_key not in list_keys_diff: if first_key: if len(str(nb_sample[1][this_key])) <= del_limit: - new_meta_project.append( - {this_key: nb_sample[1][this_key]} - ) + new_str = nb_sample[1][this_key] + if isinstance(nb_sample[1][this_key], str): + new_str = nb_sample[1][this_key].replace('"', "") + # new_str = re.sub("[^A-Za-z0-9]+", " ", new_str) + new_meta_project.append({this_key: new_str}) first_key = False del meta_list[nb_sample[0]][this_key] except KeyError: @@ -1104,38 +1279,12 @@ def separate_common_meta( meta_list = new_list if input_is_dict: - new_sample_dict = {} - for sample in meta_list: - new_sample_dict[sample["big_key"]] = sample - meta_list = new_sample_dict - + meta_list = _dict_to_list_converter(proj_list=meta_list) return meta_list, new_meta_project - def standardize_colnames(self, meta_list): - """ - Standardize column names by lower-casing and underscore - :param list meta_list: list of dictionaries of samples - :return : list of dictionaries of samples with standard colnames + def _download_SRA_file(self, run_name: str): """ - new_metalist = [] - list_keys = self.get_list_of_keys(meta_list) - for item_nb, values in enumerate(meta_list): - new_metalist.append({}) - for key in list_keys: - try: - new_key_name = key.lower().strip() - new_key_name = self.sanitize_name(new_key_name) - - new_metalist[item_nb][new_key_name] = values[key] - - except KeyError: - pass - 
- return new_metalist - - def download_SRA_file(self, run_name): - """ - Downloading SRA file by ising 'prefetch' utility from the SRA Toolkit + Download SRA file by ising 'prefetch' utility from the SRA Toolkit more info: (http://www.ncbi.nlm.nih.gov/books/NBK242621/) :param str run_name: SRR number of the SRA file """ @@ -1161,30 +1310,9 @@ def download_SRA_file(self, run_name): ) time.sleep(t * 2) - @staticmethod - def which(program): + def _sra_to_bam_conversion_sam_dump(self, bam_file: str, run_name: str) -> NoReturn: """ - return str: the path to a program to make sure it exists - """ - import os - - def is_exe(fp): - return os.path.isfile(fp) and os.access(fp, os.X_OK) - - fpath, fname = os.path.split(program) - if fpath: - if is_exe(program): - return program - else: - for path in os.environ["PATH"].split(os.pathsep): - path = path.strip('"') - exe_file = os.path.join(path, program) - if is_exe(exe_file): - return exe_file - - def sra_bam_conversion(self, bam_file, run_name): - """ - Converting of SRA file to BAM file by using samtools function "sam-dump" + Convert SRA file to BAM file by using samtools function "sam-dump" :param str bam_file: path to BAM file that has to be created :param str run_name: SRR number of the SRA file that has to be converted """ @@ -1206,54 +1334,11 @@ def sra_bam_conversion(self, bam_file, run_name): self._LOGGER.info(f"Conversion command: {cmd}") run_subprocess(cmd, shell=True) - @staticmethod - def update_columns(metadata, experiment_name, sample_name, read_type): - """ - Update the metadata associated with a particular experiment. - - For the experiment indicated, this function updates the value (mapping), - including new data and populating columns used by looper based on - existing values in the mapping. 
- - :param Mapping metadata: the key-value mapping to update - :param str experiment_name: name of the experiment from which these - data came and are associated; the key in the metadata mapping - for which the value is to be updated - :param str sample_name: name of the sample with which these data are - associated - :param str read_type: usually "single" or "paired," an indication of the - type of sequencing reads for this experiment - :return Mapping: - """ - - exp = metadata[experiment_name] - - # Protocol-agnostic - exp["sample_name"] = sample_name - exp["protocol"] = exp["Sample_library_selection"] - exp["read_type"] = read_type - exp["organism"] = exp["Sample_organism_ch1"] - exp["data_source"] = "SRA" - exp["SRX"] = experiment_name - - # Protocol specified is lowercased prior to checking here to alleviate - # dependence on case for the value in the annotations file. - bisulfite_protocols = {"reduced representation": "RRBS", "random": "WGBS"} - - # Conditional on bisulfite sequencing - # print(":" + exp["Sample_library_strategy"] + ":") - # Try to be smart about some library methods, refining protocol if possible. - if exp["Sample_library_strategy"] == "Bisulfite-Seq": - # print("Parsing protocol") - proto = exp["Sample_library_selection"].lower() - if proto in bisulfite_protocols: - exp["protocol"] = bisulfite_protocols[proto] - - return exp - - def sra_bam_conversion2(self, bam_file, run_name, picard_path=None): + def _sra_to_bam_conversion_fastq_damp( + self, bam_file: str, run_name: str, picard_path: str = None + ) -> NoReturn: """ - Converting of SRA file to BAM file by using fastq-dump + Convert SRA file to BAM file by using fastq-dump (is used when sam-dump fails, yielding an empty bam file. 
Here fastq -> bam conversion is used) :param str bam_file: path to BAM file that has to be created :param str run_name: SRR number of the SRA file that has to be converted @@ -1290,9 +1375,11 @@ def sra_bam_conversion2(self, bam_file, run_name, picard_path=None): self._LOGGER.info(f"Conversion command: {cmd}") run_subprocess(cmd, shell=True) - def write_subannotation(self, tabular_data, filepath, column_names=None): + def _write_subannotation( + self, tabular_data: dict, filepath: str, column_names: list = None + ): """ - Writes one or more tables to a given CSV filepath. + Write one or more tables to a given CSV filepath. :param tabular_data: Mapping | Iterable[Mapping]: single KV pair collection, or collection of such collections, to write to disk as tabular data @@ -1317,9 +1404,11 @@ def write_subannotation(self, tabular_data, filepath, column_names=None): writer.writerows(values) return fp - def download_file(self, file_url, data_folder, new_name=None, sleep_after=0.5): + def _download_file( + self, file_url: str, data_folder: str, new_name: str = None, sleep_after=0.5 + ) -> NoReturn: """ - Given an url for a file, downloading to specified folder + Given an url for a file, downloading file to specified folder :param str file_url: the URL of the file to download :param str data_folder: path to the folder where data should be downloaded :param float sleep_after: time to sleep after downloading @@ -1345,21 +1434,23 @@ def download_file(self, file_url, data_folder, new_name=None, sleep_after=0.5): else: self._LOGGER.info(f"\033[38;5;242mFile {full_filepath} exists.\033[0m") - def get_list_of_processed_files(self, file_gse, file_gsm): + def _get_list_of_processed_files( + self, file_gse_content: list, file_gsm_content: list + ) -> tuple: """ Given a paths to GSE and GSM metafile create a list of dicts of metadata of processed files - :param str file_gse: the path to gse metafile - :param str file_gsm: the path to gse metafile - :return list: list of metadata of 
processed files + :param list file_gse_content: list of lines of gse metafile + :param list file_gsm_content: list of lines of gse metafile + :return: tuple[list of metadata of processed sample files and series files] """ tar_re = re.compile(r".*\.tar$") gse_numb = None meta_processed_samples = [] meta_processed_series = {"GSE": "", "files": []} - for line in open(file_gse, "r"): + for line in file_gse_content: if re.compile(r"!Series_geo_accession").search(line): - gse_numb = self.get_value(line) + gse_numb = _get_value(line) meta_processed_series["GSE"] = gse_numb found = re.findall(SER_SUPP_FILE_PATTERN, line) @@ -1373,21 +1464,40 @@ def get_list_of_processed_files(self, file_gse, file_gsm): if tar_re.search(filename): # find and download filelist - file with information about files in tar index = file_url.rfind("/") - tar_files_list_url = file_url[: index + 1] + "filelist.txt" + tar_files_list_url = ( + "https" + file_url[3 : index + 1] + "filelist.txt" + ) # file_list_name filelist_path = os.path.join( self.metadata_expanded, gse_numb + "_file_list.txt" ) - self.download_file( - tar_files_list_url, - self.metadata_expanded, - gse_numb + "_file_list.txt", - ) + + # TODO: make new function of code below: + if not os.path.isfile(filelist_path) or self.refresh_metadata: + result = requests.get(tar_files_list_url) + if result.ok: + result.encoding = "UTF-8" + filelist_raw_text = result.text + if not self.discard_soft: + try: + with open(filelist_path, "w") as f: + f.write(filelist_raw_text) + except OSError: + self._LOGGER.warning( + f"{filelist_path} not found. File won't be saved.." 
+ ) + + else: + raise Exception(f"error in requesting tar_files_list") + else: + self._LOGGER.info(f"Found previous GSM file: {filelist_path}") + filelist_obj = open(filelist_path, "r") + filelist_raw_text = filelist_obj.read() nb = len(meta_processed_samples) - 1 - for line_gsm in open(file_gsm, "r"): + for line_gsm in file_gsm_content: if line_gsm[0] == "^": - nb = len(self.check_file_existance(meta_processed_samples)) + nb = len(_check_file_existance(meta_processed_samples)) meta_processed_samples.append( {"files": [], "GSE": gse_numb} ) @@ -1431,21 +1541,19 @@ def get_list_of_processed_files(self, file_gse, file_gsm): if file_url_gsm != "NONE": meta_processed_samples[nb]["files"].append(file_url_gsm) - self.check_file_existance(meta_processed_samples) - meta_processed_samples = self.separate_list_of_files( - meta_processed_samples - ) - meta_processed_samples = self.separate_file_url( + _check_file_existance(meta_processed_samples) + meta_processed_samples = _separate_list_of_files( meta_processed_samples ) + meta_processed_samples = _separate_file_url(meta_processed_samples) self._LOGGER.info( - f"Total number of processed SAMPLES files found is: " + f"\nTotal number of processed SAMPLES files found is: " f"%s" % str(len(meta_processed_samples)) ) # expand meta_processed_samples with information about type and size - file_info_add = self.read_tar_filelist(filelist_path) + file_info_add = _read_tar_filelist(filelist_raw_text) for index_nr in range(len(meta_processed_samples)): file_name = meta_processed_samples[index_nr]["file"] meta_processed_samples[index_nr].update( @@ -1453,9 +1561,11 @@ def get_list_of_processed_files(self, file_gse, file_gsm): ) if self.filter_re: - meta_processed_samples = self.run_filter(meta_processed_samples) + meta_processed_samples = self._run_filter( + meta_processed_samples + ) if self.filter_size: - meta_processed_samples = self.run_size_filter( + meta_processed_samples = self._run_size_filter( meta_processed_samples ) @@ 
-1482,80 +1592,23 @@ def get_list_of_processed_files(self, file_gse, file_gsm): f"IndexError in adding value to meta_processed_series: %s" % ind_err ) - meta_processed_series = self.separate_list_of_files(meta_processed_series) - meta_processed_series = self.separate_file_url(meta_processed_series) + meta_processed_series = _separate_list_of_files(meta_processed_series) + meta_processed_series = _separate_file_url(meta_processed_series) self._LOGGER.info( f"Total number of processed SERIES files found is: " f"%s" % str(len(meta_processed_series)) ) if self.filter_re: - meta_processed_series = self.run_filter(meta_processed_series) + meta_processed_series = self._run_filter(meta_processed_series) return meta_processed_samples, meta_processed_series - @staticmethod - def check_file_existance(meta_processed_sample): - """ - Checking if last element of the list has files. If list of files is empty deleting it - """ - nb = len(meta_processed_sample) - 1 - if nb > -1: - if len(meta_processed_sample[nb]["files"]) == 0: - del meta_processed_sample[nb] - nb -= 1 - return meta_processed_sample - - @staticmethod - def separate_list_of_files(meta_list, col_name="files"): + def _run_filter(self, meta_list: list, col_name: str = "file") -> list: """ - This method is separating list of files (dict value) or just simple dict - into two different dicts - """ - separated_list = [] - if type(meta_list) == list: - for meta_elem in meta_list: - for file_elem in meta_elem[col_name]: - new_dict = meta_elem.copy() - new_dict.pop(col_name, None) - new_dict["file"] = file_elem - separated_list.append(new_dict) - elif type(meta_list) == dict: - for file_elem in meta_list[col_name]: - new_dict = meta_list.copy() - new_dict.pop(col_name, None) - new_dict["file"] = file_elem - separated_list.append(new_dict) - else: - return TypeError("Incorrect type") - - return separated_list - - def separate_file_url(self, meta_list): - """ - This method is adding dict key without file_name without path - 
""" - separated_list = [] - for meta_elem in meta_list: - new_dict = meta_elem.copy() - new_dict["file_url"] = meta_elem["file"] - new_dict["file"] = os.path.basename(meta_elem["file"]) - # new_dict["sample_name"] = os.path.basename(meta_elem["file"]) - try: - new_dict["sample_name"] = str(meta_elem["Sample_title"]) - if new_dict["sample_name"] == "" or new_dict["sample_name"] is None: - raise KeyError("sample_name Does not exist. Creating .. ") - except KeyError: - new_dict["sample_name"] = os.path.basename(meta_elem["file"]) - - # sanitize sample names - new_dict["sample_name"] = self.sanitize_name(new_dict["sample_name"]) - - separated_list.append(new_dict) - return separated_list - - def run_filter(self, meta_list, col_name="file"): - """ - If user specified filter it will filter all this files here by col_name + Filters files and metadata using Regular expression filter + :param meta_list: list of composed metadata + :param col_name: name of the column where file names are stored + :return: metadata list after file_name filter """ filtered_list = [] for meta_elem in meta_list: @@ -1568,9 +1621,12 @@ def run_filter(self, meta_list, col_name="file"): return filtered_list - def run_size_filter(self, meta_list, col_name="file_size"): + def _run_size_filter(self, meta_list, col_name="file_size"): """ - function for filtering file size + Filters files and metadata by file size column specified in meta_list + :param meta_list: list of composed metadata + :param col_name: name of the column where is size information stored + :return: metadata list after size filter """ if self.filter_size is not None: filtered_list = [] @@ -1588,51 +1644,13 @@ def run_size_filter(self, meta_list, col_name="file_size"): ) return filtered_list - @staticmethod - def read_tar_filelist(file_path): - """ - Creating list for supplementary files that are listed in "filelist.txt" - :param str file_path: path to the file with information about files that are zipped ("filelist.txt") - :return 
dict: dict of supplementary file names and additional information - """ - - files_info = {} - with open(file_path, newline="") as csvfile: - csv_reader = csv.reader(csvfile, delimiter="\t") - line_count = 0 - for row in csv_reader: - if line_count == 0: - name_index = row.index("Name") - size_index = row.index("Size") - type_index = row.index("Type") - - line_count += 1 - else: - files_info[row[name_index]] = { - "file_size": row[size_index], - "type": row[type_index], - } - - return files_info - - @staticmethod - def get_value(all_line): - line_value = all_line.split("= ")[-1] - return line_value.split(": ")[-1].rstrip("\n") - - def download_processed_file(self, file_url, data_folder): - + def _download_processed_file(self, file_url: str, data_folder: str) -> bool: """ Given a url for a file, download it, and extract anything passing the filter. :param str file_url: the URL of the file to download :param str data_folder: the local folder where the file should be saved :return bool: True if the file is downloaded successfully; false if it does not pass filters and is not downloaded. - - # :param re.Pattern tar_re: a regulator expression (produced from re.compile) - # that pulls out filenames with .tar in them --- deleted - # :param re.Pattern filter_re: a regular expression (produced from - # re.compile) to filter filenames of interest. 
""" if not self.geo_folder: @@ -1646,7 +1664,7 @@ def download_processed_file(self, file_url, data_folder): while ntry < 10: try: - self.download_file(file_url, data_folder) + self._download_file(file_url, data_folder) self._LOGGER.info( "\033[92mFile %s has been downloaded successfully\033[0m" % f"{data_folder}/{filename}" @@ -1664,16 +1682,16 @@ def download_processed_file(self, file_url, data_folder): if ntry > 4: raise e - def get_SRA_meta(self, file_gse, gsm_metadata, file_sra=None): + def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None): """ Parse out the SRA project identifier from the GSE file - :param str file_gse: full path to GSE.soft metafile + :param list file_gse_content: list of content of file_sde_content :param dict gsm_metadata: dict of GSM metadata :param str file_sra: full path to SRA.csv metafile that has to be downloaded """ # acc_SRP = None - for line in open(file_gse, "r"): + for line in file_gse_content: found = re.findall(PROJECT_PATTERN, line) if found: acc_SRP = found[0] @@ -1685,20 +1703,20 @@ def get_SRA_meta(self, file_gse, gsm_metadata, file_sra=None): # as part of this GEO submission. Can't proceed. self._LOGGER.warning( "\033[91mUnable to get SRA accession (SRP#) from GEO GSE SOFT file. " - "No raw data?\033[0m" + "No raw data detected! Continuing anyway...\033[0m" ) # but wait; another possibility: there's no SRP linked to the GSE, but there # could still be an SRX linked to the (each) GSM. if len(gsm_metadata) == 1: try: - acc_SRP = gsm_metadata.keys()[0] + acc_SRP = list(gsm_metadata.keys())[0] self._LOGGER.warning( "But the GSM has an SRX number; instead of an " "SRP, using SRX identifier for this sample: " + acc_SRP ) except TypeError: self._LOGGER.warning("Error in gsm_metadata") - return False + return [] # else: # # More than one sample? not sure what to do here. Does this even happen? 
@@ -1710,8 +1728,9 @@ def get_SRA_meta(self, file_gse, gsm_metadata, file_sra=None): if not os.path.isfile(file_sra) or self.refresh_metadata: try: # downloading metadata - srp_list = self.get_SRP_list(acc_SRP) - if file_sra is not None: + srp_list = self._get_SRP_list(acc_SRP) + srp_list = _unify_list_keys(srp_list) + if file_sra is not None and not self.discard_soft: with open(file_sra, "w") as m_file: dict_writer = csv.DictWriter(m_file, srp_list[0].keys()) dict_writer.writeheader() @@ -1724,7 +1743,7 @@ def get_SRA_meta(self, file_gse, gsm_metadata, file_sra=None): f"\033[91mError occurred, while downloading SRA Info Metadata of {acc_SRP}. " f"Error: {err} \033[0m" ) - return False + return [] else: # open existing annotation self._LOGGER.info(f"Found SRA metadata, opening..") @@ -1740,7 +1759,7 @@ def get_SRA_meta(self, file_gse, gsm_metadata, file_sra=None): return srp_list else: try: - srp_list = self.get_SRP_list(acc_SRP) + srp_list = self._get_SRP_list(acc_SRP) return srp_list except Exception as err: @@ -1748,14 +1767,17 @@ def get_SRA_meta(self, file_gse, gsm_metadata, file_sra=None): f"\033[91mError occurred, while downloading SRA Info Metadata of {acc_SRP}. 
" f"Error: {err} \033[0m" ) - return False + return [] - def get_SRP_list(self, srp_number: str) -> list: + def _get_SRP_list(self, srp_number: str) -> list: """ - By using requests and xml searching and getting list of dicts of SRRs + Get a list of srp by using requests and xml searching and getting list of dicts of SRRs :param str srp_number: SRP number :return: list of dicts of SRRs """ + if not srp_number: + self._LOGGER.info(f"No srp number in this accession found") + return [] self._LOGGER.info(f"Downloading {srp_number} sra metadata") ncbi_esearch = NCBI_ESEARCH.format(SRP_NUMBER=srp_number) @@ -1763,30 +1785,42 @@ def get_SRP_list(self, srp_number: str) -> list: x = requests.post(ncbi_esearch) if x.status_code != 200: + x.encoding = "UTF-8" self._LOGGER.error(f"Error in ncbi esearch response: {x.status_code}") raise x.raise_for_status() - id_results = x.json()["esearchresult"]["idlist"] + if len(id_results) > 500: + id_results = [ + id_results[x : x + 100] for x in range(0, len(id_results), 100) + ] + else: + id_results = [id_results] - id_r_string = ",".join(id_results) - id_api = NCBI_EFETCH.format(ID=id_r_string) - y = requests.get(id_api) - if y.status_code != 200: - self._LOGGER.error(f"Error in ncbi efetch response: {x.status_code}") - raise y.raise_for_status() + SRP_list = [] + for result in id_results: + id_r_string = ",".join(result) + id_api = NCBI_EFETCH.format(ID=id_r_string) - xml_result = y.text - SRP_list = xmltodict.parse(xml_result)["SraRunInfo"]["Row"] + y = requests.get(id_api) + if y.status_code != 200: + self._LOGGER.error( + f"Error in ncbi efetch response in SRA fetching: {x.status_code}" + ) + raise y.raise_for_status() + xml_result = y.text + SRP_list.extend(xmltodict.parse(xml_result)["SraRunInfo"]["Row"]) return SRP_list - def get_gsm_metadata(self, acc_GSE, acc_GSE_list, file_gsm): + def _read_gsm_metadata( + self, acc_GSE: str, acc_GSE_list: dict, file_gsm_content: list + ) -> dict: """ A simple state machine to parse SOFT 
formatted files (Here, the GSM file) :param str acc_GSE: GSE number (Series accession) :param dict acc_GSE_list: list of GSE - :param str file_gsm: full path to GSM.soft metafile + :param list file_gsm_content: list of contents of gsm file :return dict: dictionary of experiment information (gsm_metadata) """ gsm_metadata = {} @@ -1798,7 +1832,7 @@ def get_gsm_metadata(self, acc_GSE, acc_GSE_list, file_gsm): current_sample_id = None current_sample_srx = False samples_list = [] - for line in open(file_gsm, "r"): + for line in file_gsm_content: line = line.rstrip() if len(line) == 0: # Apparently SOFT files can contain blank lines continue @@ -1834,7 +1868,17 @@ def get_gsm_metadata(self, acc_GSE, acc_GSE_list, file_gsm): f"line: {line}" ) continue - gsm_metadata[current_sample_id].update(pl) + new_key = list(pl.keys())[0] + if new_key in gsm_metadata[current_sample_id]: + if isinstance(gsm_metadata[current_sample_id][new_key], list): + gsm_metadata[current_sample_id][new_key].append(pl[new_key]) + else: + gsm_metadata[current_sample_id][new_key] = [ + gsm_metadata[current_sample_id][new_key] + ] + gsm_metadata[current_sample_id][new_key].append(pl[new_key]) + else: + gsm_metadata[current_sample_id].update(pl) # Now convert the ids GEO accessions into SRX accessions if not current_sample_srx: @@ -1850,9 +1894,23 @@ def get_gsm_metadata(self, acc_GSE, acc_GSE_list, file_gsm): current_sample_srx = True # GSM SOFT file parsed, save it in a list self._LOGGER.info(f"Processed {len(samples_list)} samples.") + gsm_metadata = self._expand_metadata_dict(gsm_metadata) return gsm_metadata - def _write(self, f_var_value, content, msg_pre=None, omit_newline=False): + def _write( + self, + f_var_value: str, + content: str, + msg_pre: str = None, + omit_newline: bool = False, + ): + """ + Save new file (used for config file) + :param f_var_value: path to the file + :param content: content of the file + :param msg_pre: msg that have to be printed + :param omit_newline: omit new line 
+ """ fp = expandpath(f_var_value) self._LOGGER.info((msg_pre or "") + fp) with open(fp, "w") as f: @@ -1861,274 +1919,6 @@ def _write(self, f_var_value, content, msg_pre=None, omit_newline=False): f.write("\n") -def _parse_cmdl(cmdl): - parser = argparse.ArgumentParser( - description="Automatic GEO and SRA data downloader" - ) - - processed_group = parser.add_argument_group("processed") - raw_group = parser.add_argument_group("raw") - - parser.add_argument( - "-V", "--version", action="version", version=f"%(prog)s {__version__}" - ) - - # Required - parser.add_argument( - "-i", - "--input", - dest="input", - required=True, - help="required: a GEO (GSE) accession, or a file with a list of GSE numbers", - ) - - # Optional - parser.add_argument( - "-n", "--name", help="Specify a project name. Defaults to GSE number" - ) - - parser.add_argument( - "-m", - "--metadata-root", - dest="metadata_root", - default=safe_echo("SRAMETA"), - help="Specify a parent folder location to store metadata. " - "The project name will be added as a subfolder " - "[Default: $SRAMETA:" + safe_echo("SRAMETA") + "]", - ) - - parser.add_argument( - "-u", - "--metadata-folder", - help="Specify an absolute folder location to store metadata. " - "No subfolder will be added. Overrides value of --metadata-root " - "[Default: Not used (--metadata-root is used by default)]", - ) - - parser.add_argument( - "--just-metadata", - action="store_true", - help="If set, don't actually run downloads, just create metadata", - ) - - parser.add_argument( - "-r", - "--refresh-metadata", - action="store_true", - help="If set, re-download metadata even if it exists.", - ) - - parser.add_argument( - "--config-template", default=None, help="Project config yaml file template." - ) - - # Optional - parser.add_argument( - "--pipeline-samples", - default=None, - help="Optional: Specify one or more filepaths to SAMPLES pipeline interface yaml files. 
" - "These will be added to the project config file to make it immediately " - "compatible with looper. [Default: null]", - ) - - # Optional - parser.add_argument( - "--pipeline-project", - default=None, - help="Optional: Specify one or more filepaths to PROJECT pipeline interface yaml files. " - "These will be added to the project config file to make it immediately " - "compatible with looper. [Default: null]", - ) - - # Optional - parser.add_argument( - "-k", - "--skip", - default=0, - type=int, - help="Skip some accessions. [Default: no skip].", - ) - - parser.add_argument( - "--acc-anno", - action="store_true", - help="Optional: Produce annotation sheets for each accession." - " Project combined PEP for the whole project won't be produced.", - ) - - parser.add_argument( - "--discard-soft", - action="store_true", - help="Optional: After creation of PEP files, all soft and additional files will be deleted", - ) - - parser.add_argument( - "--const-limit-project", - type=int, - default=50, - help="Optional: Limit of the number of the constant sample characters " - "that should not be in project yaml. [Default: 50]", - ) - - parser.add_argument( - "--const-limit-discard", - type=int, - default=250, - help="Optional: Limit of the number of the constant sample characters " - "that should not be discarded [Default: 250]", - ) - - parser.add_argument( - "--attr-limit-truncate", - type=int, - default=500, - help="Optional: Limit of the number of sample characters." 
- "Any attribute with more than X characters will truncate to the first X," - " where X is a number of characters [Default: 500]", - ) - - parser.add_argument( - "--add-dotfile", - action="store_true", - help="Optional: Add .pep.yaml file that points .yaml PEP file", - ) - - processed_group.add_argument( - "-p", - "--processed", - default=False, - action="store_true", - help="Download processed data [Default: download raw data].", - ) - - processed_group.add_argument( - "--data-source", - dest="data_source", - choices=["all", "samples", "series"], - default="samples", - help="Optional: Specifies the source of data on the GEO record" - " to retrieve processed data, which may be attached to the" - " collective series entity, or to individual samples. " - "Allowable values are: samples, series or both (all). " - "Ignored unless 'processed' flag is set. [Default: samples]", - ) - - processed_group.add_argument( - "--filter", - default=None, - help="Optional: Filter regex for processed filenames [Default: None]." - "Ignored unless 'processed' flag is set.", - ) - - processed_group.add_argument( - "--filter-size", - dest="filter_size", - default=None, - help="""Optional: Filter size for processed files - that are stored as sample repository [Default: None]. - Works only for sample data. - Supported input formats : 12B, 12KB, 12MB, 12GB. - Ignored unless 'processed' flag is set.""", - ) - - processed_group.add_argument( - "-g", - "--geo-folder", - default=safe_echo("GEODATA"), - help="Optional: Specify a location to store processed GEO files." - " Ignored unless 'processed' flag is set." - "[Default: $GEODATA:" + safe_echo("GEODATA") + "]", - ) - - raw_group.add_argument( - "-x", - "--split-experiments", - action="store_true", - help="""Split SRR runs into individual samples. By default, SRX - experiments with multiple SRR Runs will have a single entry in the - annotation table, with each run as a separate row in the - subannotation table. 
This setting instead treats each run as a - separate sample""", - ) - - raw_group.add_argument( - "-b", - "--bam-folder", - dest="bam_folder", - default=safe_echo("SRABAM"), - help="""Optional: Specify folder of bam files. Geofetch will not - download sra files when corresponding bam files already exist. - [Default: $SRABAM:""" - + safe_echo("SRABAM") - + "]", - ) - - raw_group.add_argument( - "-f", - "--fq-folder", - dest="fq_folder", - default=safe_echo("SRAFQ"), - help="""Optional: Specify folder of fastq files. Geofetch will not - download sra files when corresponding fastq files already exist. - [Default: $SRAFQ:""" - + safe_echo("SRAFQ") - + "]", - ) - - # Deprecated; these are for bam conversion which now happens in sra_convert - # it still works here but I hide it so people don't use it, because it's confusing. - raw_group.add_argument( - "-s", - "--sra-folder", - dest="sra_folder", - default=safe_echo("SRARAW"), - help=argparse.SUPPRESS, - # help="Optional: Specify a location to store sra files " - # "[Default: $SRARAW:" + safe_echo("SRARAW") + "]" - ) - raw_group.add_argument( - "--bam-conversion", - action="store_true", - # help="Turn on sequential bam conversion. 
Default: No conversion.", - help=argparse.SUPPRESS, - ) - - raw_group.add_argument( - "--picard-path", - dest="picard_path", - default=safe_echo("PICARD"), - # help="Specify a path to the picard jar, if you want to convert " - # "fastq to bam [Default: $PICARD:" + safe_echo("PICARD") + "]", - help=argparse.SUPPRESS, - ) - - raw_group.add_argument( - "--use-key-subset", - action="store_true", - help="Use just the keys defined in this module when writing out metadata.", - ) - - logmuse.add_logging_options(parser) - return parser.parse_args(cmdl) - - -def safe_echo(var): - """Returns an environment variable if it exists, or an empty string if not""" - return os.getenv(var, "") - - -class InvalidSoftLineException(Exception): - """Exception related to parsing SOFT line.""" - - def __init__(self, l): - """ - Create the exception by providing the problematic line. - - :param str l: the problematic SOFT line - """ - super(self, f"{l}") - - def main(): """Run the script.""" args = _parse_cmdl(sys.argv[1:]) diff --git a/geofetch/looper_sra_convert.yaml b/geofetch/looper_sra_convert.yaml new file mode 100644 index 0000000..bf5905d --- /dev/null +++ b/geofetch/looper_sra_convert.yaml @@ -0,0 +1,45 @@ + # Adding sra convert looper pipeline + SRR_files: SRA + + derive: + attributes: [read1, read2, SRR_files] + sources: + SRA: "${SRABAM}/{SRR}.bam" + FQ: "${SRAFQ}/{SRR}.fastq.gz" + FQ1: "${SRAFQ}/{SRR}_1.fastq.gz" + FQ2: "${SRAFQ}/{SRR}_2.fastq.gz" + imply: + - if: + organism: "Mus musculus" + then: + genome: mm10 + - if: + organism: "Homo sapiens" + then: + genome: hg38 + - if: + read_type: "PAIRED" + then: + read1: FQ1 + read2: FQ2 + - if: + read_type: "SINGLE" + then: + read1: FQ1 + +project_modifiers: + amend: + sra_convert: + looper: + results_subdir: sra_convert_results + sample_modifiers: + append: + SRR_files: SRA + pipeline_interfaces: ${CODE}/geofetch/pipeline_interface_convert.yaml + derive: + attributes: [read1, read2, SRR_files] + sources: + SRA: 
"${SRARAW}/{SRR}.sra" + FQ: "${SRAFQ}/{SRR}.fastq.gz" + FQ1: "${SRAFQ}/{SRR}_1.fastq.gz" + FQ2: "${SRAFQ}/{SRR}_2.fastq.gz" diff --git a/geofetch/utils.py b/geofetch/utils.py index 7835196..b8e4e08 100644 --- a/geofetch/utils.py +++ b/geofetch/utils.py @@ -5,21 +5,13 @@ import subprocess import sys import re - - -__author__ = [ - "Oleksandr Khoroshevskyi", - "Vince Reuter", - "Nathan Sheffield", -] -__email__ = "bnt4me@virginia.edu" - -__all__ = ["parse_accessions"] - +import requests +from io import StringIO +import csv +from typing import NoReturn, Dict, List, Union _LOGGER = logging.getLogger(__name__) - # This dict provides NCBI lookup URLs for different accession types. SRX # identifiers can be used to grab metadata from SRA for a single sample, just as # an SRP identifier is used to grab the same table for multiple samples, so @@ -30,7 +22,7 @@ } -def is_known_type(accn=None, typename=None): +def is_known_type(accn: str = None, typename: str = None): """ Determine if the given accession is of a known type. @@ -60,7 +52,7 @@ def parse_accessions(input_arg, metadata_folder, just_metadata=False): interested in from that GSE#. An empty sample list means we should get all samples from that GSE#. This loop will create this dict. - :param input_arg: + :param input_arg: Input argument (GSE, or file) :param str metadata_folder: path to folder for accession metadata :param bool just_metadata: whether to only process metadata, not the actual data associated with the accession @@ -130,13 +122,12 @@ def parse_accessions(input_arg, metadata_folder, just_metadata=False): return acc_GSE_list -def parse_SOFT_line(l): +def parse_SOFT_line(l: str) -> dict: """ Parse SOFT formatted line, returning a dictionary with the key-value pair. :param str l: A SOFT-formatted line to parse ( !key = value ) :return dict[str, str]: A python Dict object representing the key-value. 
- :raise InvalidSoftLineException: if given line can't be parsed as SOFT line """ elems = l[1:].split("=") return {elems[0].rstrip(): elems[1].lstrip()} @@ -145,7 +136,7 @@ def parse_SOFT_line(l): class AccessionException(Exception): """Exceptional condition(s) dealing with accession number(s).""" - def __init__(self, reason=""): + def __init__(self, reason: str = ""): """ Optionally provide explanation for exceptional condition. @@ -181,18 +172,19 @@ def __init__(self, accn, strict=True): self.accn = accn self.typename = typename.upper() - def fetch_metadata(self, outpath=None, typename=None): + def fetch_metadata( + self, outpath: str = None, typename: str = None, clean: bool = False + ) -> list: """ Fetch the metadata associated with this accession. - :param str outpath: path to file to which to write output, optional :param str typename: type indicating URL format, use type parsed at construction if unspecified + :param str outpath: path to file to which to write output, optional + :param bool clean: if true, files won't be saved + :return: list of lines in soft file """ - # TODO: note this sort of type-dependent strategy suggests subclassing. - # For now, class is small, but that should maybe be done if it grows. - typename = (typename or self.typename).upper() if not is_known_type(typename=typename): raise self.accn_type_exception(self.accn, typename) @@ -210,7 +202,17 @@ def fetch_metadata(self, outpath=None, typename=None): raise _LOGGER.debug("Fetching: '%s'", full_url) - if outpath: + result = requests.get(full_url) + if result.ok: + result.encoding = "UTF-8" + result_text = result.text + result_list = result_text.replace("\r", "").split("\n") + result_list = [elem for elem in result_list if len(elem) > 0] + + else: + raise Exception(f"Error in requesting fileL: {full_url}") + + if outpath and not clean: # Ensure we have filepath and that needed directories exist. 
if not os.path.splitext(outpath)[1]: _LOGGER.debug("Looks like folder, not file: %s", outpath) @@ -222,15 +224,20 @@ def fetch_metadata(self, outpath=None, typename=None): if not os.path.exists(dirpath): _LOGGER.debug("Forging path to '%s'", dirpath) os.makedirs(dirpath) - cmd = "wget -O {} {}".format(outpath, full_url) - else: - cmd = "wget {}".format(full_url) - run_subprocess(cmd.split(" ")) + # save file: + with open(outpath, "w") as f: + f.write(result_text) + + return result_list @staticmethod - def _validate(accn): - """Determine if given value looks like an accession.""" + def _validate(accn: str): + """ + Determine if given value looks like an accession. + :param str accn: ordinary accession identifier. + :return: typename, number + """ typename, number = split_accn(accn) if len(typename) != 3: raise AccessionException( @@ -247,7 +254,7 @@ def _validate(accn): return typename, number @staticmethod - def accn_type_exception(accn, typename, include_known=True): + def accn_type_exception(accn: str, typename: str, include_known: bool = True): """ Create an exception instance based on an accession and a parsed unknown typename. @@ -265,7 +272,7 @@ def accn_type_exception(accn, typename, include_known=True): return AccessionException(message) -def split_accn(accn): +def split_accn(accn: str): """ Split accession into prefix and number, leaving suffix as text and converting the type prefix to uppercase. 
@@ -310,16 +317,19 @@ def clean_soft_files(meta_dir: str): and creating PEPs :param str meta_dir: Path to the metadata files """ - dir_files = os.listdir(meta_dir) + try: + dir_files = os.listdir(meta_dir) - for item in dir_files: - if ( - item.endswith(".soft") - or item.endswith("_file_list.txt") - or item.endswith("SRA.csv") - or item.endswith("SRA_filt.csv") - ): - os.remove(os.path.join(meta_dir, item)) + for item in dir_files: + if ( + item.endswith(".soft") + or item.endswith("_file_list.txt") + or item.endswith("SRA.csv") + or item.endswith("SRA_filt.csv") + ): + os.remove(os.path.join(meta_dir, item)) + except FileNotFoundError: + _LOGGER.debug("Can't clean soft files...folder doesn't exist") def run_subprocess(*args, **kwargs): @@ -333,5 +343,328 @@ def run_subprocess(*args, **kwargs): p.terminate() print("Pipeline aborted.") except OSError as ose: - _LOGGER.warn(f"Exception raised during subprocess termination: {ose}") + _LOGGER.warning(f"Exception raised during subprocess termination: {ose}") sys.exit(1) + + +def _get_list_of_keys(list_of_dict: list): + """ + Getting list of all keys that are in the dictionaries in the list + + :param list list_of_dict: list of dicts with metadata + :return list: list of dictionary keys + """ + + list_of_keys = [] + for element in list_of_dict: + list_of_keys.extend(list(element.keys())) + return list(set(list_of_keys)) + + +def _get_value(all_line: str): + """ + :param all_line: string with key value. (e.g. '!Series_geo_accession = GSE188720') + :return: value (e.g. 
GSE188720) + """ + line_value = all_line.split("= ")[-1] + return line_value.split(": ")[-1].rstrip("\n") + + +def _read_tar_filelist(raw_text: str) -> dict: + """ + Creating list for supplementary files that are listed in "filelist.txt" + :param str raw_text: path to the file with information about files that are zipped ("filelist.txt") + :return dict: dict of supplementary file names and additional information + """ + f = StringIO(raw_text) + files_info = {} + csv_reader = csv.reader(f, delimiter="\t") + line_count = 0 + for row in csv_reader: + if line_count == 0: + name_index = row.index("Name") + size_index = row.index("Size") + type_index = row.index("Type") + + line_count += 1 + else: + files_info[row[name_index]] = { + "file_size": row[size_index], + "type": row[type_index], + } + + return files_info + + +def _check_file_existance(meta_processed_sample: list) -> list: + """ + Checking if last element of the list has files. If list of files is empty deleting it + :param: meta_processed_sample: list with metadata dictionary + :return: list with metadata dictionary after processing + """ + nb = len(meta_processed_sample) - 1 + if nb > -1: + if len(meta_processed_sample[nb]["files"]) == 0: + del meta_processed_sample[nb] + nb -= 1 + return meta_processed_sample + + +def _separate_list_of_files(meta_list: Union[list, dict], col_name: str = "files"): + """ + This method is separating list of files (dict value) or just simple dict + into two different dicts + :param col_name: column name that should be added with filenames + :param meta_list: list, or dict with metadata + """ + separated_list = [] + if isinstance(meta_list, list): + for meta_elem in meta_list: + for file_elem in meta_elem[col_name]: + new_dict = meta_elem.copy() + new_dict.pop(col_name, None) + new_dict["file"] = file_elem + separated_list.append(new_dict) + elif isinstance(meta_list, dict): + for file_elem in meta_list[col_name]: + new_dict = meta_list.copy() + new_dict.pop(col_name, None) + 
new_dict["file"] = file_elem + separated_list.append(new_dict) + else: + return TypeError("Incorrect type") + + return separated_list + + +def _update_columns( + metadata: dict, experiment_name: str, sample_name: str, read_type: str +) -> dict: + """ + Update the metadata associated with a particular experiment. + + For the experiment indicated, this function updates the value (mapping), + including new data and populating columns used by looper based on + existing values in the mapping. + + :param Mapping metadata: the key-value mapping to update + :param str experiment_name: name of the experiment from which these + data came and are associated; the key in the metadata mapping + for which the value is to be updated + :param str sample_name: name of the sample with which these data are + associated + :param str read_type: usually "single" or "paired," an indication of the + type of sequencing reads for this experiment + :return: updated metadata + """ + + exp = metadata[experiment_name] + + # Protocol-agnostic + exp["sample_name"] = sample_name + exp["protocol"] = exp["Sample_library_selection"] + exp["read_type"] = read_type + exp["organism"] = exp["Sample_organism_ch1"] + exp["data_source"] = "SRA" + exp["SRX"] = experiment_name + + # Protocol specified is lowercased prior to checking here to alleviate + # dependence on case for the value in the annotations file. + bisulfite_protocols = {"reduced representation": "RRBS", "random": "WGBS"} + + # Conditional on bisulfite sequencing + # print(":" + exp["Sample_library_strategy"] + ":") + # Try to be smart about some library methods, refining protocol if possible. + if exp["Sample_library_strategy"] == "Bisulfite-Seq": + # print("Parsing protocol") + proto = exp["Sample_library_selection"].lower() + if proto in bisulfite_protocols: + exp["protocol"] = bisulfite_protocols[proto] + + return exp + + +def _sanitize_config_string(text: str) -> str: + """ + Function that sanitizes text in config file. 
+ :param text: Any string that have to be sanitized + :return: sanitized strings + """ + new_str = text + new_str = new_str.replace('"', f'\\"') + new_str = new_str.replace("'", f"''") + return new_str + + +def _sanitize_name(name_str: str) -> str: + """ + Function that sanitizes strings. (Replace all odd characters) + :param str name_str: Any string value that has to be sanitized. + :return: sanitized strings + """ + new_str = name_str + punctuation1 = r"""!"#$%&'()*,./:;<=>?@[\]^_`{|}~""" + for odd_char in list(punctuation1): + new_str = new_str.replace(odd_char, "_") + new_str = new_str.replace(" ", "_").replace("__", "_") + return new_str + + +def _create_dot_yaml(file_path: str, yaml_path: str) -> NoReturn: + """ + Function that creates .pep.yaml file that points to actual yaml file + :param str file_path: Path to the .pep.yaml file that we want to create + :param str yaml_path: path or name of the actual yaml file + """ + with open(file_path, "w+") as file: + file.writelines(f"config_file: {yaml_path}") + + +def _which(program: str): + """ + return str: the path to a program to make sure it exists + """ + import os + + def is_exe(fp): + return os.path.isfile(fp) and os.access(fp, os.X_OK) + + fpath, fname = os.path.split(program) + if fpath: + if is_exe(program): + return program + else: + for path in os.environ["PATH"].split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + + +def _dict_to_list_converter( + proj_dict: Dict = None, proj_list: List = None +) -> Union[Dict, List]: + """ + Converter project dict to list and vice versa + dict -> list + list -> dict + :param proj_dict: project dictionary + :param proj_list: project list + :return: converted values + """ + if proj_dict is not None: + new_meta_list = [] + for key in proj_dict: + new_dict = proj_dict[key] + new_dict["big_key"] = key + new_meta_list.append(new_dict) + + meta_list = new_meta_list + + elif proj_list is not None: + 
new_sample_dict = {} + for sample in proj_list: + new_sample_dict[sample["big_key"]] = sample + meta_list = new_sample_dict + + else: + raise ValueError + + return meta_list + + +def _standardize_colnames(meta_list: Union[list, dict]) -> Union[list, dict]: + """ + Standardize column names by lower-casing and underscore + :param list meta_list: list of dictionaries of samples + :return : list of dictionaries of samples with standard colnames + """ + # check if meta_list is dict and converting it to list + input_is_dict = False + if isinstance(meta_list, dict): + input_is_dict = True + meta_list = _dict_to_list_converter(proj_dict=meta_list) + + new_metalist = [] + list_keys = _get_list_of_keys(meta_list) + for item_nb, values in enumerate(meta_list): + new_metalist.append({}) + for key in list_keys: + try: + new_key_name = key.lower().strip() + new_key_name = _sanitize_name(new_key_name) + + new_metalist[item_nb][new_key_name] = values[key] + + except KeyError: + pass + + if input_is_dict: + new_metalist = _dict_to_list_converter(proj_list=new_metalist) + + return new_metalist + + +def _separate_file_url(meta_list): + """ + This method is adding dict key without file_name without path + """ + separated_list = [] + for meta_elem in meta_list: + new_dict = meta_elem.copy() + new_dict["file_url"] = meta_elem["file"] + new_dict["file"] = os.path.basename(meta_elem["file"]) + # new_dict["sample_name"] = os.path.basename(meta_elem["file"]) + try: + new_dict["sample_name"] = str(meta_elem["Sample_title"]) + if new_dict["sample_name"] == "" or new_dict["sample_name"] is None: + raise KeyError("sample_name Does not exist. Creating .. 
") + except KeyError: + new_dict["sample_name"] = os.path.basename(meta_elem["file"]) + + # sanitize sample names + new_dict["sample_name"] = _sanitize_name(new_dict["sample_name"]) + + separated_list.append(new_dict) + return separated_list + + +def _filter_gsm(meta_processed_samples: list, gsm_list: dict) -> list: + """ + Getting metadata list of all samples of one experiment and filtering it + by the list of GSM that was specified in the input files. + And then changing names of the sample names. + + :param meta_processed_samples: list of metadata dicts of samples + :param gsm_list: list of dicts where GSM (samples) are keys and + sample names are values. Where values can be empty string + """ + + if gsm_list.keys(): + new_gsm_list = [] + for gsm_sample in meta_processed_samples: + if gsm_sample["Sample_geo_accession"] in gsm_list.keys(): + gsm_sample_new = gsm_sample + if gsm_list[gsm_sample["Sample_geo_accession"]] != "": + gsm_sample_new["sample_name"] = gsm_list[ + gsm_sample["Sample_geo_accession"] + ] + new_gsm_list.append(gsm_sample_new) + return new_gsm_list + return meta_processed_samples + + +def _unify_list_keys(processed_meta_list: list) -> list: + """ + Unifying list of dicts with metadata, so every dict will have + same keys + + :param list processed_meta_list: list of dicts with metadata + :return list: list of unified dicts with metadata + """ + list_of_keys = _get_list_of_keys(processed_meta_list) + for k in list_of_keys: + for list_elem in range(len(processed_meta_list)): + if k not in processed_meta_list[list_elem]: + processed_meta_list[list_elem][k] = "" + return processed_meta_list diff --git a/mkdocs.yml b/mkdocs.yml index c6c6549..dfc6c83 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -9,8 +9,11 @@ nav: - Introduction: README.md - Install and configure: install.md - SRA convert: sra_convert.md + - Tutorials: - Tutorial for processed data: processed-data-downloading.md - Tutorial for raw data: raw-data-downloading.md + - geofetch from within 
Python: python-usage.md + - GSE Finder: gse_finder.md - How-to Guides: - Specifying samples to download: file-specification.md - Set SRA data download location: howto-location.md diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index a38f1f0..f3ec919 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -7,3 +7,7 @@ logmuse>=0.2.7 ubiquerg>=0.6.0 requests>=2.28.1 xmltodict>=0.13.0 +pandas>=1.3.5 +peppy>=0.35.1 +rich>=12.5.1 +coloredlogs>=15.0.1 diff --git a/setup.py b/setup.py index e92f5b0..79cca69 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ def read_reqs(reqs_name): ], keywords="project, bioinformatics, sequencing, ngs, workflow, GUI", url="https://github.com/pepkit/{}/".format(PACKAGE), - author="Nathan Sheffield, Vince Reuter, Oleksandr Khoroshevskyi", + author="Oleksandr Khoroshevskyi, Nathan Sheffield, Vince Reuter, Nathan LeRoy", license="BSD2", entry_points={ "console_scripts": [ diff --git a/tests/test_geofetch.py b/tests/test_geofetch.py index f16b56f..ad8c923 100644 --- a/tests/test_geofetch.py +++ b/tests/test_geofetch.py @@ -1,3 +1,6 @@ +import peppy + +import geofetch from geofetch import parse_accessions, Geofetcher, utils import os import pytest @@ -15,6 +18,7 @@ def get_soft_path(gse_numb, sample_len, series_len): run test test_file_list """ return ( + gse_numb, os.path.join(GSE_FILES, gse_numb, GSE_SOFT_NAME), os.path.join(GSE_FILES, gse_numb, GSM_SOFT_NAME), sample_len, @@ -60,7 +64,7 @@ def test_accessions_file(self): class TestListProcessedMetaFiles: """ - Testing + Testing downloading and saving process soft files """ @pytest.fixture(scope="function") @@ -75,15 +79,23 @@ def initiate_geofetcher(self, tmpdir): yield instance @pytest.mark.parametrize( - "soft_gse, soft_gsm, sample_len, series_len", processed_meta_file_test + "gse_numb,soft_gse, soft_gsm, sample_len, series_len", processed_meta_file_test ) def test_file_list( - self, soft_gse, soft_gsm, sample_len, 
series_len, initiate_geofetcher + self, gse_numb, soft_gse, soft_gsm, sample_len, series_len, initiate_geofetcher ): + file_gse_content = geofetch.Accession(gse_numb).fetch_metadata( + soft_gse, typename="GSE", clean=False + ) + file_gsm_content = geofetch.Accession(gse_numb).fetch_metadata( + soft_gsm, typename="GSM", clean=False + ) ( meta_processed_samples, meta_processed_series, - ) = initiate_geofetcher.get_list_of_processed_files(soft_gse, soft_gsm) + ) = initiate_geofetcher._get_list_of_processed_files( + file_gse_content, file_gsm_content + ) assert len(meta_processed_samples) == sample_len assert len(meta_processed_series) == series_len @@ -97,22 +109,51 @@ def test_downloading_soft_files(self, initiate_geofetcher): assert "GSE138657_GSE.soft" in downloaded_meta_files def test_creating_sample_pep_files(self, initiate_geofetcher): - initiate_geofetcher.fetch_all("GSE138657") + gse_numb = "GSE138657" + initiate_geofetcher.fetch_all(gse_numb) + downloaded_meta_files = list( + os.walk(initiate_geofetcher.metadata_expanded + f"/{gse_numb}_samples") + )[0][2] + + assert f"{gse_numb}_samples.csv" in downloaded_meta_files + assert f"{gse_numb}_samples.yaml" in downloaded_meta_files + + def test_creating_series_pep_files(self, initiate_geofetcher): + gse_numb = "GSE199313" + initiate_geofetcher.fetch_all(gse_numb) downloaded_meta_files = list( - os.walk(initiate_geofetcher.metadata_expanded + "/PEP_samples") + os.walk(initiate_geofetcher.metadata_expanded + f"/{gse_numb}_series") )[0][2] - assert "GSE138657_samples.csv" in downloaded_meta_files - assert "GSE138657_samples.yaml" in downloaded_meta_files + assert f"{gse_numb}_series.csv" in downloaded_meta_files + assert f"{gse_numb}_series.yaml" in downloaded_meta_files + + +class TestListRawMetaFiles: + """ + Testing downloading and saving raw files and metadata + """ + + @pytest.fixture(scope="function") + def initiate_geofetcher(self, tmpdir): + instance = Geofetcher( + just_metadata=True, + processed=False, + 
name="test", + metadata_folder=tmpdir, + discard_soft=True, + ) + yield instance def test_creating_series_pep_files(self, initiate_geofetcher): - initiate_geofetcher.fetch_all("GSE199313") + initiate_geofetcher.fetch_all("GSE138656") downloaded_meta_files = list( - os.walk(initiate_geofetcher.metadata_expanded + "/PEP_series") + os.walk(initiate_geofetcher.metadata_expanded + f"/PEP") )[0][2] - assert "GSE199313_series.csv" in downloaded_meta_files - assert "GSE199313_series.yaml" in downloaded_meta_files + assert "PEP_raw.csv" in downloaded_meta_files + assert "PEP.yaml" in downloaded_meta_files + assert "PEP_raw_subtable.csv" in downloaded_meta_files class TestDownloadingProcFiles: @@ -137,7 +178,7 @@ def initiate_geofetcher(self, tmpdir): ], ) def test_downloading_files(self, file_url, file_name, tmpdir, initiate_geofetcher): - initiate_geofetcher.download_processed_file(file_url, tmpdir) + initiate_geofetcher._download_processed_file(file_url, tmpdir) assert len(tmpdir.listdir()) == 1 assert os.path.basename(tmpdir.listdir()[0]) == file_name @@ -155,7 +196,7 @@ def initiate_geofetcher(self, tmpdir): processed=True, name="test", metadata_folder=tmpdir, - filter="\.Bed.gz$", + filter=r"\.Bed.gz$", filter_size="2MB", ) yield instance @@ -178,7 +219,7 @@ def initiate_geofetcher(self, tmpdir): ], ) def test_filter(self, meta_list, output, initiate_geofetcher): - result = initiate_geofetcher.run_filter(meta_list) + result = initiate_geofetcher._run_filter(meta_list) assert result == output @pytest.mark.parametrize( @@ -199,7 +240,7 @@ def test_filter(self, meta_list, output, initiate_geofetcher): ], ) def test_size_filter(self, meta_list, output, initiate_geofetcher): - result = initiate_geofetcher.run_size_filter(meta_list) + result = initiate_geofetcher._run_size_filter(meta_list) assert result == output @pytest.mark.parametrize( @@ -248,11 +289,73 @@ def test_size_filter(self, meta_list, output, initiate_geofetcher): def test_large_meta_separation( self, 
init_meta_data, result_sample, result_proj, initiate_geofetcher ): - samp, proj = initiate_geofetcher.separate_common_meta(init_meta_data, max_len=0) + samp, proj = initiate_geofetcher._separate_common_meta( + init_meta_data, max_len=0 + ) assert samp == result_sample assert proj == result_proj +class TestPeppyInitProcessed: + """ + Testing downloading and saving raw files and metadata + """ + + @pytest.fixture(scope="function") + def initiate_geofetcher(self, tmpdir): + instance = Geofetcher( + just_metadata=True, + processed=True, + name="test", + metadata_folder=tmpdir, + discard_soft=True, + data_source="all", + ) + yield instance + + def test_creating_processed_peppy(self, initiate_geofetcher): + gse_numb = "GSE190287" + p_prop = initiate_geofetcher.get_projects(gse_numb) + assert isinstance(p_prop[f"{gse_numb}_samples"], peppy.Project) + assert isinstance(p_prop[f"{gse_numb}_series"], peppy.Project) + + def test_number_of_samples(self, initiate_geofetcher): + gse_numb = "GSE190287" + p_prop = initiate_geofetcher.get_projects(gse_numb) + assert ( + len(p_prop[f"{gse_numb}_samples"].samples) == 8 + ) # it has 11 files but 8 samples + assert len(p_prop[f"{gse_numb}_series"].samples) == 2 + + +class TestPeppyInitRaw: + """ + Testing downloading and saving raw files and metadata + """ + + @pytest.fixture(scope="function") + def initiate_geofetcher(self, tmpdir): + instance = Geofetcher( + just_metadata=True, + processed=False, + name="test", + metadata_folder=tmpdir, + discard_soft=True, + ) + yield instance + + def test_creating_processed_peppy(self, initiate_geofetcher): + gse_numb = "GSE189141" + p_prop = initiate_geofetcher.get_projects(gse_numb) + assert isinstance(p_prop[f"{gse_numb}_raw"], peppy.Project) + + def test_number_of_samples(self, initiate_geofetcher): + gse_numb = "GSE189141" + p_prop = initiate_geofetcher.get_projects(gse_numb) + a = [d["sample_name"] for d in p_prop[f"{gse_numb}_raw"].samples] + assert len(p_prop[f"{gse_numb}_raw"].samples) == 
16 # it has 16 samples + + def test_clean_func(tmpdir): """ Testing deleting soft files