From 88565ae623b1ac58630158129e93d7af148bb9aa Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Fri, 20 Jan 2023 15:35:34 -0500 Subject: [PATCH 1/5] Added saving experiment metadata functionality --- geofetch/config_processed_template.yaml | 2 + geofetch/config_template.yaml | 2 + geofetch/geofetch.py | 81 +++++++++++++++++++++---- geofetch/utils.py | 22 +++++++ 4 files changed, 94 insertions(+), 13 deletions(-) diff --git a/geofetch/config_processed_template.yaml b/geofetch/config_processed_template.yaml index 1198863..cd7621e 100644 --- a/geofetch/config_processed_template.yaml +++ b/geofetch/config_processed_template.yaml @@ -4,6 +4,8 @@ pep_version: 2.1.0 project_name: {project_name} sample_table: {sample_table} +{project_metadata} + sample_modifiers: append: output_file_path: FILES diff --git a/geofetch/config_template.yaml b/geofetch/config_template.yaml index 588d81a..40a19b1 100644 --- a/geofetch/config_template.yaml +++ b/geofetch/config_template.yaml @@ -3,6 +3,8 @@ name: {project_name} pep_version: 2.1.0 sample_table: {annotation} + +{project_metadata} {subannotation} {sample_modifier_str} diff --git a/geofetch/geofetch.py b/geofetch/geofetch.py index 41e14ca..1f4b72f 100755 --- a/geofetch/geofetch.py +++ b/geofetch/geofetch.py @@ -39,6 +39,7 @@ _separate_file_url, _filter_gsm, _unify_list_keys, + gse_content_to_dict, ) from rich.progress import track @@ -416,6 +417,8 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje file_gse_content = gse_file_obj.read().split("\n") file_gse_content = [elem for elem in file_gse_content if len(elem) > 0] + file_gse_content_dict = gse_content_to_dict(file_gse_content) + if not os.path.isfile(file_gsm) or self.refresh_metadata: file_gsm_content = Accession(acc_GSE).fetch_metadata( file_gsm, @@ -453,7 +456,10 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje # generating PEPs for processed files: if self.acc_anno: self._generate_processed_meta( - acc_GSE, meta_processed_samples, meta_processed_series + acc_GSE, + meta_processed_samples, + meta_processed_series, + gse_meta_dict=file_gse_content_dict, ) else: @@ -494,11 +500,8 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje # save one project if self.acc_anno and nkeys > 1: - self._write_raw_annotation_new( - name=acc_GSE, - metadata_dict=gsm_metadata, - subannot_dict=gsm_multi_table, - ) + self._write_raw_annotation_new(name=acc_GSE, metadata_dict=gsm_metadata, + subannot_dict=gsm_multi_table, gse_meta_dict=file_gse_content_dict) else: metadata_dict_combined.update(gsm_metadata) @@ -520,6 +523,7 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje name=self.project_name, meta_processed_samples=processed_metadata_samples, meta_processed_series=processed_metadata_series, + gse_meta_dict=file_gse_content_dict if len(acc_GSE_list.keys()) == 1 else None, ) if self.just_object: return return_value @@ -530,6 +534,7 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje f"{self.project_name}_PEP", metadata_dict_combined, subannotation_dict_combined, + gse_meta_dict=file_gse_content_dict if len(acc_GSE_list.keys()) == 1 else None, ) if self.just_object: return return_value @@ -706,7 +711,11 @@ def fetch_processed_one( return meta_processed_samples, meta_processed_series def _generate_processed_meta( - self, name: str, meta_processed_samples: list, meta_processed_series: list + self, + name: str, + meta_processed_samples: list, + meta_processed_series: list, + gse_meta_dict: Union[dict, None] = None, ) -> dict: """ Generate and save PEPs for processed accessions. GEO has data in GSE and GSM, @@ -714,6 +723,8 @@ def _generate_processed_meta( :param name: name of the folder/file where PEP will be saved :param meta_processed_samples: :param meta_processed_series: + :param gse_meta_dict: dict of metadata fetched from one experiment. + Used to add this data to config file. :return: dict of objects if just_object is set, otherwise dicts of None """ return_objects = {f"{name}_samples": None, f"{name}_series": None} @@ -729,6 +740,7 @@ def _generate_processed_meta( meta_processed_samples, pep_acc_path_sample, just_object=self.just_object, + gse_meta_dict=gse_meta_dict, ) # series @@ -753,6 +765,7 @@ def _generate_processed_meta( meta_processed_samples, pep_acc_path_sample, just_object=self.just_object, + gse_meta_dict=gse_meta_dict, ) elif self.supp_by == "series": return_objects[f"{name}_series"] = pep_acc_path_exp = os.path.join( @@ -957,12 +970,15 @@ def _write_processed_annotation( processed_metadata: list, file_annotation_path: str, just_object: bool = False, + gse_meta_dict: dict = None, ) -> Union[NoReturn, peppy.Project]: """ Save annotation file by providing list of dictionaries with files metadata :param list processed_metadata: list of dictionaries with files metadata :param str file_annotation_path: the path to the metadata file that has to be saved - :type just_object: True, if you want to get peppy object without saving file + :param just_object: True, if you want to get peppy object without saving file + :param gse_meta_dict: dict of metadata fetched from one experiment. + Used to add this data to config file. :return: none, or peppy project """ if len(processed_metadata) == 0: @@ -991,7 +1007,7 @@ def _write_processed_annotation( self.attr_limit_truncate, ) - template = self._create_config_processed(file_annotation_path, proj_meta) + template = self._create_config_processed(file_annotation_path, proj_meta, meta_in_series=gse_meta_dict) if not just_object: with open(file_annotation_path, "w") as m_file: @@ -1044,7 +1060,11 @@ def _find_genome(metadata_list: list) -> list: return metadata_list def _write_raw_annotation_new( - self, name, metadata_dict: dict, subannot_dict: dict = None + self, + name, + metadata_dict: dict, + subannot_dict: dict = None, + gse_meta_dict: dict = None, ) -> Union[None, peppy.Project]: """ Combine individual accessions into project-level annotations, and writing @@ -1052,6 +1072,7 @@ def _write_raw_annotation_new( :param name: Name of the run, project, or acc --> will influence name of the folder where project will be created :param metadata_dict: dictionary of sample annotations :param subannot_dict: dictionary of subsample annotations + :param gse_meta_dict: dict of experiment metadata that was sotred in gse :return: none or peppy object """ try: @@ -1101,7 +1122,7 @@ def _write_raw_annotation_new( subanot_path_yaml = f"" template = self._create_config_raw( - proj_meta, proj_root_sample, subanot_path_yaml + proj_meta, proj_root_sample, subanot_path_yaml, gse_meta_dict ) if not self.just_object: @@ -1137,12 +1158,16 @@ def _write_raw_annotation_new( return proj def _create_config_processed( - self, file_annotation_path: str, proj_meta: list + self, + file_annotation_path: str, + proj_meta: list, + meta_in_series: dict = True, ) -> str: """ Compose and generate config file content :param file_annotation_path: root to the annotation file :param proj_meta: common metadata that has to added to config file + :param meta_in_series: :return: generated, complete config file content """ geofetchdir = os.path.dirname(__file__) @@ -1154,6 +1179,17 @@ def _create_config_processed( for i in proj_meta ] modifiers_str = "\n ".join(d for d in meta_list_str) + + # series metadata + if not meta_in_series: + project_metadata = '' + else: + meta_list_str = { + i: j + for i, j in meta_in_series.items() + } + project_metadata = yaml.dump(meta_list_str, default_style='"') + template_values = { "project_name": self.project_name, "sample_table": os.path.basename(file_annotation_path), @@ -1161,18 +1197,25 @@ def _create_config_processed( "pipeline_samples": self.file_pipeline_samples, "pipeline_project": self.file_pipeline_project, "additional_columns": modifiers_str, + "project_metadata": project_metadata, } for k, v in template_values.items(): placeholder = "{" + str(k) + "}" template = template.replace(placeholder, str(v)) return template - def _create_config_raw(self, proj_meta, proj_root_sample, subanot_path_yaml): + def _create_config_raw( + self, + proj_meta, + proj_root_sample, + subanot_path_yaml, + meta_in_series=None): """ Compose and generate config file content for raw data :param proj_meta: root to the annotation file :param proj_root_sample: path to sampletable file :param subanot_path_yaml: path to subannotation file + :param meta_in_series: :return: generated, complete config file content """ meta_list_str = [ @@ -1195,6 +1238,17 @@ def _create_config_raw(self, proj_meta, proj_root_sample, subanot_path_yaml): sra_convert_template = template_file.read() else: sra_convert_template = "" + + # series metadata + if not meta_in_series: + project_metadata = '' + else: + meta_list_str = { + i: j + for i, j in meta_in_series.items() + } + project_metadata = yaml.dump(meta_list_str, default_style='"') + with open(self.config_template, "r") as template_file: template = template_file.read() template_values = { @@ -1206,6 +1260,7 @@ def _create_config_raw(self, proj_meta, proj_root_sample, subanot_path_yaml): "pipeline_project": self.file_pipeline_project, "additional_columns": modifiers_str, "sra_convert": sra_convert_template, + "project_metadata": project_metadata, } for k, v in template_values.items(): placeholder = "{" + str(k) + "}" diff --git a/geofetch/utils.py b/geofetch/utils.py index 7cd2aaa..b07091a 100644 --- a/geofetch/utils.py +++ b/geofetch/utils.py @@ -722,3 +722,25 @@ def _unify_list_keys(processed_meta_list: list) -> list: if k not in processed_meta_list[list_elem]: processed_meta_list[list_elem][k] = "" return processed_meta_list + + +def gse_content_to_dict(gse_content: List[str]) -> Dict[str, dict]: + """ + Unpack gse soft file to dict + :param gse_content: list of strings of gse soft file + :return: dict of gse content + """ + gse_dict = {} + for line in gse_content: + if line.startswith("^"): + pass + elif line.startswith("!"): + key_value = line.split(" = ") + new_key = _sanitize_name(key_value[0][1:]) + new_value = _sanitize_config_string(" ".join(key_value[1:])) + if new_key in gse_dict.keys(): + gse_dict[new_key] = f"{gse_dict[new_key]} + {new_value}" + else: + gse_dict[new_key] = new_value + + return {"experiment_metadata": gse_dict} From d14620543eb184b9592ba17e62f8dfcf31477a96 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Fri, 20 Jan 2023 15:37:00 -0500 Subject: [PATCH 2/5] added changelog --- docs/changelog.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/changelog.md b/docs/changelog.md index 16c3f43..a5e6397 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,8 @@ # Changelog +## [0.11.3] -- 2023-01-20 +- Added functionality that saves gse metadata to config file + ## [0.11.2] -- 2022-12-25 - Changed sample_name of PEP of processed files to file oriented - Added `--max-soft-size` argument, that sets size limit of soft files From efa2a2ede3e40a7616f5b145cadc1efd99b27471 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Fri, 20 Jan 2023 17:37:42 -0500 Subject: [PATCH 3/5] version + Fixed #108 --- geofetch/_version.py | 2 +- requirements/requirements-all.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/geofetch/_version.py b/geofetch/_version.py index e2bd072..1bebb74 100644 --- a/geofetch/_version.py +++ b/geofetch/_version.py @@ -1 +1 @@ -__version__ = "0.11.2" +__version__ = "0.11.3" diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 77a8ee1..854b024 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -4,7 +4,7 @@ logmuse>=0.2.6 ubiquerg>=0.6.2 requests>=2.28.1 xmltodict>=0.13.0 -pandas>=1.3.5 +pandas>=1.5.3 peppy>=0.35.3 rich>=12.5.1 coloredlogs>=15.0.1 From ae1321403e50e632445665c36d5e011dbd42aac7 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Fri, 20 Jan 2023 17:40:36 -0500 Subject: [PATCH 4/5] lint --- geofetch/geofetch.py | 51 ++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/geofetch/geofetch.py b/geofetch/geofetch.py index 1f4b72f..d36a491 100755 --- a/geofetch/geofetch.py +++ b/geofetch/geofetch.py @@ -500,8 +500,12 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje # save one project if self.acc_anno and nkeys > 1: - self._write_raw_annotation_new(name=acc_GSE, metadata_dict=gsm_metadata, - subannot_dict=gsm_multi_table, gse_meta_dict=file_gse_content_dict) + self._write_raw_annotation_new( + name=acc_GSE, + metadata_dict=gsm_metadata, + subannot_dict=gsm_multi_table, + gse_meta_dict=file_gse_content_dict, + ) else: metadata_dict_combined.update(gsm_metadata) @@ -523,7 +527,9 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje name=self.project_name, meta_processed_samples=processed_metadata_samples, meta_processed_series=processed_metadata_series, - gse_meta_dict=file_gse_content_dict if len(acc_GSE_list.keys()) == 1 else None, + gse_meta_dict=file_gse_content_dict + if len(acc_GSE_list.keys()) == 1 + else None, ) if self.just_object: return return_value @@ -534,7 +540,9 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje f"{self.project_name}_PEP", metadata_dict_combined, subannotation_dict_combined, - gse_meta_dict=file_gse_content_dict if len(acc_GSE_list.keys()) == 1 else None, + gse_meta_dict=file_gse_content_dict + if len(acc_GSE_list.keys()) == 1 + else None, ) if self.just_object: return return_value @@ -1007,7 +1015,9 @@ def _write_processed_annotation( self.attr_limit_truncate, ) - template = self._create_config_processed(file_annotation_path, proj_meta, meta_in_series=gse_meta_dict) + template = self._create_config_processed( + file_annotation_path, proj_meta, meta_in_series=gse_meta_dict + ) if not just_object: with open(file_annotation_path, "w") as m_file: @@ -1060,11 +1070,11 @@ def _find_genome(metadata_list: list) -> list: return metadata_list def _write_raw_annotation_new( - self, - name, - metadata_dict: dict, - subannot_dict: dict = None, - gse_meta_dict: dict = None, + self, + name, + metadata_dict: dict, + subannot_dict: dict = None, + gse_meta_dict: dict = None, ) -> Union[None, peppy.Project]: """ Combine individual accessions into project-level annotations, and writing @@ -1182,12 +1192,9 @@ def _create_config_processed( # series metadata if not meta_in_series: - project_metadata = '' + project_metadata = "" else: - meta_list_str = { - i: j - for i, j in meta_in_series.items() - } + meta_list_str = {i: j for i, j in meta_in_series.items()} project_metadata = yaml.dump(meta_list_str, default_style='"') template_values = { @@ -1205,11 +1212,8 @@ def _create_config_processed( return template def _create_config_raw( - self, - proj_meta, - proj_root_sample, - subanot_path_yaml, - meta_in_series=None): + self, proj_meta, proj_root_sample, subanot_path_yaml, meta_in_series=None + ): """ Compose and generate config file content for raw data :param proj_meta: root to the annotation file @@ -1241,12 +1245,9 @@ def _create_config_raw( # series metadata if not meta_in_series: - project_metadata = '' + project_metadata = "" else: - meta_list_str = { - i: j - for i, j in meta_in_series.items() - } + meta_list_str = {i: j for i, j in meta_in_series.items()} project_metadata = yaml.dump(meta_list_str, default_style='"') with open(self.config_template, "r") as template_file: From 4ff185e4dd078dddcad31758eb2026f9beda5d6e Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 23 Jan 2023 11:51:11 -0500 Subject: [PATCH 5/5] changed version --- docs/changelog.md | 2 +- geofetch/_version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index a5e6397..1e4ba1a 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,6 @@ # Changelog -## [0.11.3] -- 2023-01-20 +## [0.12.0] -- 2023-01-23 - Added functionality that saves gse metadata to config file ## [0.11.2] -- 2022-12-25 diff --git a/geofetch/_version.py b/geofetch/_version.py index 1bebb74..ea370a8 100644 --- a/geofetch/_version.py +++ b/geofetch/_version.py @@ -1 +1 @@ -__version__ = "0.11.3" +__version__ = "0.12.0"