diff --git a/relecov_tools/ena_upload.py b/relecov_tools/ena_upload.py new file mode 100644 index 00000000..62b60c2a --- /dev/null +++ b/relecov_tools/ena_upload.py @@ -0,0 +1,149 @@ +import os +import logging +import rich.console +from email import utils +import json as j +import xml.etree.cElementTree as e + +import relecov_tools.utils + +log = logging.getLogger(__name__) +stderr = rich.console.Console( + stderr=True, + style="dim", + highlight=False, + force_terminal=relecov_tools.utils.rich_force_colors(), +) + + +class XmlCreation: + def __init__(self, source_json=None, output_path=None, action=None): + if source_json is None: + self.source_json = utils.prompt_source_path() + else: + self.source_json = source_json + if output_path is None: + self.output_path = utils.prompt_destination_path() + else: + self.output_path = output_path + if action is None: + self.action = "ADD" + else: + self.action = action + + def xml_study( + self, + ): + """ + 1.From validated json to xml study- submission.xml and project.xml + 1.1 Upload study info + + 2. From validated json to xml samples - submission.xml and samples.xml + 2.2 Upload samples info + + 3. From sftp upload runs (FASTQ files programmatic)- experiments.xmlm, runs.xml and submission.xml + 4. From sftp upload sequences (FASTA files programmatic) - json using webin-cli-rest + """ + + # Load validated json + with open(self.source_json) as json_format_file: + json_data = j.load(json_format_file) + + # Create output directory + + try: + # Create target Directory + os.mkdir(self.output_path) + print("Directory ", self.output_path, " Created ") + except FileExistsError: + print("Directory ", self.output_path, " already exists") + + # 1. From validated json to xml study- submission.xml and project.xml + + # submission.xml + os.chdir("xml_files/") + if self.action.upper == "ADD": + # submission add + submission_file = "submission_add.xml" + if self.action.upper() == "MODIFY": + # submission modify + submission_file = "submission_modify.xml" + + # project_relecov.xml + os.chdir("../conf") + dict_conf = j.loads("configuration.json") + r = e.Element("PROJECT_SET") + project = e.SubElement(r, "PROJECT") + project.set("alias", dict_conf["project_relecov_xml"]["alias"]) + e.SubElement(project, "TITLE").text = dict_conf["project_relecov_xml"][ + "TITLE" + ] + e.SubElement(project, "DESCRIPTION").text = dict_conf[ + "project_relecov_xml" + ]["DESCRIPTION"] + submission = e.SubElement(project, "SUBMISSION_PROJECT") + e.SubElement(submission, "SEQUENCING_PROJECT") + a = e.ElementTree(r) + a.write(os.path.join(self.output_path, "study", "project_relecov.xml")) + + # 1.1 Upload study info + # 2. From validated json to xml samples - submission.xml and samples.xml + + def xml_samples(): + # submission.xml + os.chdir("../xml_files/") + if self.action.upper == "ADD": + # submission add + submission_file = "submission_add.xml" + if self.action.upper() == "MODIFY": + # submission modify + submission_file = "submission_modify.xml" + + # samples_relecov.xml + os.chdir("../schema/") + json_data = j.loads("to_ena.json") + os.chdir("../conf") + dict_conf = j.loads("configuration.json") + + data_keys = list(json_data.keys()) + r = e.Element("SAMPLE_SET") + sample = e.SubElement(r, "SAMPLE") + sample.set( + "alias", + "Programmatic Test SARS-CoV-2 Sample" + str(json_data["sample_name"]), + ) + e.SubElement(sample, "TITLE").text = "SARS-CoV-2 Sample" + str( + json_data["sample_name"] + ) + sample_name = e.SubElement(sample, "SAMPLE_NAME") + e.SubElement(sample_name, "TAXON_ID").text = dict_conf["fixed_data"][ + "tax_id" + ] + e.SubElement(sample_name, "SCIENTIFIC_NAME").text = dict_conf["fixed_data"][ + "scientific_name" + ] + e.SubElement(sample, "DESCRIPTION").text = "SARS-CoV-2 Sample" + str( + json_data["sample_name"] + ) + sample_attributes = e.SubElement(sample, "SAMPLE_ATTRIBUTES") + for i in json_data: + sample_attribute = e.SubElement(sample_attributes, "SAMPLE_ATTRIBUTE") + e.SubElement(sample_attribute, "TAG").text = str(i) + e.SubElement(sample_attribute, "VALUE").text = json_data[i] + a = e.ElementTree(r) + a.write(os.path.join(self.output_path, "samples", "samples_relecov.xml")) + + # 2.2 Upload samples info + + # 3. From sftp upload runs (FASTQ files programmatic)- experiments.xmlm, runs.xml and submission.xml + # 4. From sftp upload sequences (FASTA files programmatic) - json using webin-cli-rest + + +# Adaptation to ena_upload +with open('../example_data/ena_upload.json','r') as f: + data = j.loads(f.read()) + +df_study = pd.DataFrame.from_dict(data["study"]) +df_samples = pd.DataFrame.from_dict(data["samples"]) +df_runs = pd.DataFrame.from_dict(data["runs"]) +df_experiments = pd.DataFrame.from_dict(data["experiments"]) \ No newline at end of file diff --git a/relecov_tools/ena_upload_buisciii.py b/relecov_tools/ena_upload_buisciii.py new file mode 100644 index 00000000..226f04a3 --- /dev/null +++ b/relecov_tools/ena_upload_buisciii.py @@ -0,0 +1,174 @@ +import os +import logging +import rich.console +from email import utils +import json as j +import xml.etree.cElementTree as e + +import relecov_tools.utils + +log = logging.getLogger(__name__) +stderr = rich.console.Console( + stderr=True, + style="dim", + highlight=False, + force_terminal=relecov_tools.utils.rich_force_colors(), +) + + +class XmlCreation: + def __init__(self, source_json=None, output_path=None, action=None): + if source_json is None: + self.source_json = utils.prompt_source_path() + else: + self.source_json = source_json + if output_path is None: + self.output_path = utils.prompt_destination_path() + else: + self.output_path = output_path + if action is None: + self.action = "ADD" + else: + self.action = action + + def xml_study( + self, + ): + """ + 1.From validated json to xml study- submission.xml and project.xml + 1.1 Upload study info + + 2. From validated json to xml samples - submission.xml and samples.xml + 2.2 Upload samples info + + 3. From sftp upload runs (FASTQ files programmatic)- experiments.xmlm, runs.xml and submission.xml + 4. From sftp upload sequences (FASTA files programmatic) - json using webin-cli-rest + """ + + # Load validated json + with open(self.source_json) as json_format_file: + json_data = j.load(json_format_file) + + # Create output directory + + try: + # Create target Directory + os.mkdir(self.output_path) + print("Directory ", self.output_path, " Created ") + except FileExistsError: + print("Directory ", self.output_path, " already exists") + + # 1. From validated json to xml study- submission.xml and project.xml + + # submission.xml + os.chdir("xml_files/") + if self.action.upper == "ADD": + # submission add + submission_file = "submission_add.xml" + if self.action.upper() == "MODIFY": + # submission modify + submission_file = "submission_modify.xml" + + # project_relecov.xml + os.chdir("../conf") + dict_conf = j.loads("configuration.json") + r = e.Element("PROJECT_SET") + project = e.SubElement(r, "PROJECT") + project.set("alias", dict_conf["project_relecov_xml"]["alias"]) + e.SubElement(project, "TITLE").text = dict_conf["project_relecov_xml"][ + "TITLE" + ] + e.SubElement(project, "DESCRIPTION").text = dict_conf[ + "project_relecov_xml" + ]["DESCRIPTION"] + submission = e.SubElement(project, "SUBMISSION_PROJECT") + e.SubElement(submission, "SEQUENCING_PROJECT") + a = e.ElementTree(r) + a.write(os.path.join(self.output_path, "study", "project_relecov.xml")) + + # 1.1 Upload study info + """ + import requests + from requests.structures import CaseInsensitiveDict + + url = "https://reqbin.com/echo/post/json" + + headers = CaseInsensitiveDict() + headers["Content-Type"] = "application/json" + headers["Authorization"] = "Basic bG9naW46cGFzc3dvcmQ=" + + data = '{"login":"my_login","password":"my_password"}' + + + resp = requests.post(url, headers=headers, data=data) + + print(resp.status_code) + """ + + # 2. From validated json to xml samples - submission.xml and samples.xml + + def xml_samples(): + # submission.xml + os.chdir("../xml_files/") + if self.action.upper == "ADD": + # submission add + submission_file = "submission_add.xml" + if self.action.upper() == "MODIFY": + # submission modify + submission_file = "submission_modify.xml" + + # samples_relecov.xml + os.chdir("../schema/") + json_data = j.loads("to_ena.json") + os.chdir("../conf") + dict_conf = j.loads("configuration.json") + + data_keys = list(json_data.keys()) + r = e.Element("SAMPLE_SET") + sample = e.SubElement(r, "SAMPLE") + sample.set( + "alias", + "Programmatic Test SARS-CoV-2 Sample" + str(json_data["sample_name"]), + ) + e.SubElement(sample, "TITLE").text = "SARS-CoV-2 Sample" + str( + json_data["sample_name"] + ) + sample_name = e.SubElement(sample, "SAMPLE_NAME") + e.SubElement(sample_name, "TAXON_ID").text = dict_conf["fixed_data"][ + "tax_id" + ] + e.SubElement(sample_name, "SCIENTIFIC_NAME").text = dict_conf["fixed_data"][ + "scientific_name" + ] + e.SubElement(sample, "DESCRIPTION").text = "SARS-CoV-2 Sample" + str( + json_data["sample_name"] + ) + sample_attributes = e.SubElement(sample, "SAMPLE_ATTRIBUTES") + for i in json_data: + sample_attribute = e.SubElement(sample_attributes, "SAMPLE_ATTRIBUTE") + e.SubElement(sample_attribute, "TAG").text = str(i) + e.SubElement(sample_attribute, "VALUE").text = json_data[i] + a = e.ElementTree(r) + a.write(os.path.join(self.output_path, "samples", "samples_relecov.xml")) + + # 2.2 Upload samples info + """ + import requests + from requests.structures import CaseInsensitiveDict + + url = "https://reqbin.com/echo/post/json" + + headers = CaseInsensitiveDict() + headers["Content-Type"] = "application/json" + headers["Authorization"] = "Basic bG9naW46cGFzc3dvcmQ=" + + data = '{"login":"my_login","password":"my_password"}' + + + resp = requests.post(url, headers=headers, data=data) + + print(resp.status_code) + """ + + # 3. From sftp upload runs (FASTQ files programmatic)- experiments.xmlm, runs.xml and submission.xml + # 4. From sftp upload sequences (FASTA files programmatic) - json using webin-cli-rest diff --git a/relecov_tools/example_data/ena_upload.json b/relecov_tools/example_data/ena_upload.json index e6689ca8..92aaefb9 100644 --- a/relecov_tools/example_data/ena_upload.json +++ b/relecov_tools/example_data/ena_upload.json @@ -16,7 +16,7 @@ "study_abstract_1", "study_abstract_2" ], - "pubmed_id": [None, + "pubmed_id": ["None", "pubmed_id_2" ] }, @@ -157,7 +157,7 @@ "single" ], "insert_size": [ - "250",None,None + "250","None","None" ], "library_construction_protocol": [ "library_construction_protocol_1", diff --git a/relecov_tools/test/try_01.ipynb b/relecov_tools/test/try_01.ipynb new file mode 100644 index 00000000..df0f0e80 --- /dev/null +++ b/relecov_tools/test/try_01.ipynb @@ -0,0 +1,261 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 25, + "id": "aef6b5c1", + "metadata": {}, + "outputs": [], + "source": [ + "#json to dataframe\n", + "import pandas as pd\n", + "\n", + "df = pd.read_json('../example_data/ena_upload_study.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "8cf882d3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
study
alias[study_alias_4, study_alias_5]
\n", + "
" + ], + "text/plain": [ + " study\n", + "alias [study_alias_4, study_alias_5]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "95cf4dd2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2,)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lista = data[\"study\"][\"alias\"]\n", + "import numpy as np\n", + "a = np.array(lista)\n", + "a.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "f1a7dcf1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'study': {'alias': ['study_alias_4', 'study_alias_5']}}" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "6f6c7f87", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "# load data using Python JSON module\n", + "with open('../example_data/ena_upload.json','r') as f:\n", + " data = json.loads(f.read())\n", + "\n", + "lista = data[\"study\"]\n", + "df_study = pd.DataFrame(lista, columns=['alias',\"title\",\"study_type\",\"study_abstract\",\"pubmed_id\"])\n", + "\n", + "#study_df pd.DataFrame(columns=[\"alias\"], )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "a8066308", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'alias': ['study_alias_4', 'study_alias_5'],\n", + " 'title': ['study_title_1', 'study_title_2'],\n", + " 'study_type': ['Transcriptome Analysis', 'RNASeq'],\n", + " 'study_abstract': ['study_abstract_1', 'study_abstract_2'],\n", + " 'pubmed_id': ['None', 'pubmed_id_2']}" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lista" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "5dcb05c6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
aliastitlestudy_typestudy_abstractpubmed_id
0study_alias_4study_title_1Transcriptome Analysisstudy_abstract_1None
1study_alias_5study_title_2RNASeqstudy_abstract_2pubmed_id_2
\n", + "
" + ], + "text/plain": [ + " alias title study_type study_abstract \\\n", + "0 study_alias_4 study_title_1 Transcriptome Analysis study_abstract_1 \n", + "1 study_alias_5 study_title_2 RNASeq study_abstract_2 \n", + "\n", + " pubmed_id \n", + "0 None \n", + "1 pubmed_id_2 " + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_study" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "308e5c21", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}